From: Andrei Tatar
Date: Mon, 17 Jul 2023 16:25:57 +0000 (+0200)
Subject: build: Make clang compatibility explicit
X-Git-Tag: RELEASE-0.14.0~8
X-Git-Url: http://xenbits.xensource.com/gitweb?a=commitdiff_plain;h=ddf533cd849576de9bc9647525be3290363b484d;p=unikraft%2Flibs%2Fintel-intrinsics.git

build: Make clang compatibility explicit

This change makes explicit that this library only supports clang by
renaming the include directory and only adding it to the include path
if compiling with clang. A future change may thus explicitly add
GCC-compatible headers.

Signed-off-by: Andrei Tatar
Reviewed-by: Maria Sfiraiala
Reviewed-by: Radu Nichita
Approved-by: Razvan Deaconescu
Tested-by: Unikraft CI
GitHub-Closes: #3
---

diff --git a/Makefile.uk b/Makefile.uk
index 0131610..b8cc585 100644
--- a/Makefile.uk
+++ b/Makefile.uk
@@ -41,6 +41,6 @@ $(eval $(call addlib_s,libintel_intrinsics,$(CONFIG_LIBINTEL_INTRINSICS)))
 # Library includes
 ################################################################################
 ifeq ($(CONFIG_LIBINTEL_INTRINSICS),y)
-CINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += -I$(LIBINTEL_INTRINSICS_BASE)/include
-CXXINCLUDES-$(CONFIG_LIBINTEL_INTRINSICS) += -I$(LIBINTEL_INTRINSICS_BASE)/include
+CINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
+CXXINCLUDES-$(call have_clang) += -I$(LIBINTEL_INTRINSICS_BASE)/include-llvm
 endif
diff --git a/include-llvm/__wmmintrin_aes.h b/include-llvm/__wmmintrin_aes.h
new file mode 100644
index 0000000..3010b38
--- /dev/null
+++ b/include-llvm/__wmmintrin_aes.h
@@ -0,0 +1,140 @@
+/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------===
+ *
+ * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+ * See https://llvm.org/LICENSE.txt for license information.
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+ *
+ *===-----------------------------------------------------------------------===
+ */
+
+#ifndef __WMMINTRIN_H
+#error "Never use <__wmmintrin_aes.h> directly; include <wmmintrin.h> instead."
+#endif
+
+#ifndef __WMMINTRIN_AES_H
+#define __WMMINTRIN_AES_H
+
+/* Define the default attributes for the functions in this file. */
+#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128)))
+
+/// Performs a single round of AES encryption using the Equivalent
+/// Inverse Cipher, transforming the state value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the VAESENC instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R
+/// A 128-bit integer vector containing the round key value.
+/// \returns A 128-bit integer vector containing the encrypted value.
+static __inline__ __m128i __DEFAULT_FN_ATTRS
+_mm_aesenc_si128(__m128i __V, __m128i __R)
+{
+  return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R);
+}
+
+/// Performs the final round of AES encryption using the Equivalent
+/// Inverse Cipher, transforming the state value from the first source
+/// operand using a 128-bit round key value contained in the second source
+/// operand, and writes the result to the destination.
+///
+/// \headerfile <x86intrin.h>
+///
+/// This intrinsic corresponds to the VAESENCLAST instruction.
+///
+/// \param __V
+/// A 128-bit integer vector containing the state value.
+/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the encrypted value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_aesenclast_si128(__m128i __V, __m128i __R) +{ + return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R); +} + +/// Performs a single round of AES decryption using the Equivalent +/// Inverse Cipher, transforming the state value from the first source +/// operand using a 128-bit round key value contained in the second source +/// operand, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VAESDEC instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the decrypted value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_aesdec_si128(__m128i __V, __m128i __R) +{ + return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R); +} + +/// Performs the final round of AES decryption using the Equivalent +/// Inverse Cipher, transforming the state value from the first source +/// operand using a 128-bit round key value contained in the second source +/// operand, and writes the result to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VAESDECLAST instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the state value. +/// \param __R +/// A 128-bit integer vector containing the round key value. +/// \returns A 128-bit integer vector containing the decrypted value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_aesdeclast_si128(__m128i __V, __m128i __R) +{ + return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R); +} + +/// Applies the AES InvMixColumns() transformation to an expanded key +/// contained in the source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VAESIMC instruction. +/// +/// \param __V +/// A 128-bit integer vector containing the expanded key. +/// \returns A 128-bit integer vector containing the transformed value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_aesimc_si128(__m128i __V) +{ + return (__m128i)__builtin_ia32_aesimc128((__v2di)__V); +} + +/// Generates a round key for AES encryption, operating on 128-bit data +/// specified in the first source operand and using an 8-bit round constant +/// specified by the second source operand, and writes the result to the +/// destination. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R); +/// \endcode +/// +/// This intrinsic corresponds to the AESKEYGENASSIST instruction. +/// +/// \param C +/// A 128-bit integer vector that is used to generate the AES encryption key. +/// \param R +/// An 8-bit round constant used to generate the AES encryption key. +/// \returns A 128-bit round key for AES encryption. 
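[Usage illustration, not part of the patch: a minimal sketch of how the AES intrinsics in this header combine into a full AES-128 block encryption. It assumes clang with -maes and that the 11 round keys have already been expanded (for example with _mm_aeskeygenassist_si128, defined just below); the function and parameter names are hypothetical.]

#include <wmmintrin.h>

/* Encrypt one 16-byte block with pre-expanded AES-128 round keys.
 * rk points to 11 round keys; rk[0] is the initial whitening key. */
static __m128i aes128_encrypt_block(__m128i block, const __m128i rk[11])
{
        block = _mm_xor_si128(block, rk[0]);            /* initial AddRoundKey */
        for (int i = 1; i < 10; i++)
                block = _mm_aesenc_si128(block, rk[i]); /* rounds 1-9 */
        return _mm_aesenclast_si128(block, rk[10]);     /* final round */
}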
+#define _mm_aeskeygenassist_si128(C, R) \ + ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))) + +#undef __DEFAULT_FN_ATTRS + +#endif /* __WMMINTRIN_AES_H */ diff --git a/include-llvm/__wmmintrin_pclmul.h b/include-llvm/__wmmintrin_pclmul.h new file mode 100644 index 0000000..fef4b93 --- /dev/null +++ b/include-llvm/__wmmintrin_pclmul.h @@ -0,0 +1,48 @@ +/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __WMMINTRIN_H +#error "Never use <__wmmintrin_pclmul.h> directly; include instead." +#endif + +#ifndef __WMMINTRIN_PCLMUL_H +#define __WMMINTRIN_PCLMUL_H + +/// Multiplies two 64-bit integer values, which are selected from source +/// operands using the immediate-value operand. The multiplication is a +/// carry-less multiplication, and the 128-bit integer product is stored in +/// the destination. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I); +/// \endcode +/// +/// This intrinsic corresponds to the VPCLMULQDQ instruction. +/// +/// \param __X +/// A 128-bit vector of [2 x i64] containing one of the source operands. +/// \param __Y +/// A 128-bit vector of [2 x i64] containing one of the source operands. +/// \param __I +/// An immediate value specifying which 64-bit values to select from the +/// operands. Bit 0 is used to select a value from operand \a __X, and bit +/// 4 is used to select a value from operand \a __Y: \n +/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n +/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n +/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n +/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used. +/// \returns The 128-bit integer vector containing the result of the carry-less +/// multiplication of the selected 64-bit values. +#define _mm_clmulepi64_si128(X, Y, I) \ + ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \ + (__v2di)(__m128i)(Y), (char)(I))) + +#endif /* __WMMINTRIN_PCLMUL_H */ diff --git a/include-llvm/adxintrin.h b/include-llvm/adxintrin.h new file mode 100644 index 0000000..72b9ed0 --- /dev/null +++ b/include-llvm/adxintrin.h @@ -0,0 +1,72 @@ +/*===---- adxintrin.h - ADX intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __ADXINTRIN_H +#define __ADXINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +/* Intrinsics that are available only if __ADX__ defined */ +static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx"))) +_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y, + unsigned int *__p) +{ + return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); +} + +#ifdef __x86_64__ +static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx"))) +_addcarryx_u64(unsigned char __cf, unsigned long long __x, + unsigned long long __y, unsigned long long *__p) +{ + return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); +} +#endif + +/* Intrinsics that are also available if __ADX__ undefined */ +static __inline unsigned char __DEFAULT_FN_ATTRS +_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y, + unsigned int *__p) +{ + return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); +} + +#ifdef __x86_64__ +static __inline unsigned char __DEFAULT_FN_ATTRS +_addcarry_u64(unsigned char __cf, unsigned long long __x, + unsigned long long __y, unsigned long long *__p) +{ + return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); +} +#endif + +static __inline unsigned char __DEFAULT_FN_ATTRS +_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y, + unsigned int *__p) +{ + return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p); +} + +#ifdef __x86_64__ +static __inline unsigned char __DEFAULT_FN_ATTRS +_subborrow_u64(unsigned char __cf, unsigned long long __x, + unsigned long long __y, unsigned long long *__p) +{ + return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __ADXINTRIN_H */ diff --git a/include-llvm/ammintrin.h b/include-llvm/ammintrin.h new file mode 100644 index 0000000..1af2096 --- /dev/null +++ b/include-llvm/ammintrin.h @@ -0,0 +1,183 @@ +/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __AMMINTRIN_H +#define __AMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128))) + +/// Extracts the specified bits from the lower 64 bits of the 128-bit +/// integer vector operand at the index \a idx and of the length \a len. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx); +/// \endcode +/// +/// This intrinsic corresponds to the EXTRQ instruction. +/// +/// \param x +/// The value from which bits are extracted. +/// \param len +/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0] +/// are zero, the length is interpreted as 64. +/// \param idx +/// Bits [5:0] specify the index of the least significant bit; the other +/// bits are ignored. If the sum of the index and length is greater than 64, +/// the result is undefined. If the length and index are both zero, bits +/// [63:0] of parameter \a x are extracted. 
If the length is zero but the +/// index is non-zero, the result is undefined. +/// \returns A 128-bit integer vector whose lower 64 bits contain the bits +/// extracted from the source operand. +#define _mm_extracti_si64(x, len, idx) \ + ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \ + (char)(len), (char)(idx))) + +/// Extracts the specified bits from the lower 64 bits of the 128-bit +/// integer vector operand at the index and of the length specified by +/// \a __y. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the EXTRQ instruction. +/// +/// \param __x +/// The value from which bits are extracted. +/// \param __y +/// Specifies the index of the least significant bit at [13:8] and the +/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the +/// length is interpreted as 64. If the sum of the index and length is +/// greater than 64, the result is undefined. If the length and index are +/// both zero, bits [63:0] of parameter \a __x are extracted. If the length +/// is zero but the index is non-zero, the result is undefined. +/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted +/// from the source operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_extract_si64(__m128i __x, __m128i __y) +{ + return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y); +} + +/// Inserts bits of a specified length from the source integer vector +/// \a y into the lower 64 bits of the destination integer vector \a x at +/// the index \a idx and of the length \a len. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len, +/// const int idx); +/// \endcode +/// +/// This intrinsic corresponds to the INSERTQ instruction. +/// +/// \param x +/// The destination operand where bits will be inserted. The inserted bits +/// are defined by the length \a len and by the index \a idx specifying the +/// least significant bit. +/// \param y +/// The source operand containing the bits to be extracted. The extracted +/// bits are the least significant bits of operand \a y of length \a len. +/// \param len +/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0] +/// are zero, the length is interpreted as 64. +/// \param idx +/// Bits [5:0] specify the index of the least significant bit; the other +/// bits are ignored. If the sum of the index and length is greater than 64, +/// the result is undefined. If the length and index are both zero, bits +/// [63:0] of parameter \a y are inserted into parameter \a x. If the length +/// is zero but the index is non-zero, the result is undefined. +/// \returns A 128-bit integer vector containing the original lower 64-bits of +/// destination operand \a x with the specified bitfields replaced by the +/// lower bits of source operand \a y. The upper 64 bits of the return value +/// are undefined. +#define _mm_inserti_si64(x, y, len, idx) \ + ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \ + (__v2di)(__m128i)(y), \ + (char)(len), (char)(idx))) + +/// Inserts bits of a specified length from the source integer vector +/// \a __y into the lower 64 bits of the destination integer vector \a __x +/// at the index and of the length specified by \a __y. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the INSERTQ instruction. +/// +/// \param __x +/// The destination operand where bits will be inserted. 
The inserted bits +/// are defined by the length and by the index of the least significant bit +/// specified by operand \a __y. +/// \param __y +/// The source operand containing the bits to be extracted. The extracted +/// bits are the least significant bits of operand \a __y with length +/// specified by bits [69:64]. These are inserted into the destination at the +/// index specified by bits [77:72]; all other bits are ignored. If bits +/// [69:64] are zero, the length is interpreted as 64. If the sum of the +/// index and length is greater than 64, the result is undefined. If the +/// length and index are both zero, bits [63:0] of parameter \a __y are +/// inserted into parameter \a __x. If the length is zero but the index is +/// non-zero, the result is undefined. +/// \returns A 128-bit integer vector containing the original lower 64-bits of +/// destination operand \a __x with the specified bitfields replaced by the +/// lower bits of source operand \a __y. The upper 64 bits of the return +/// value are undefined. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_insert_si64(__m128i __x, __m128i __y) +{ + return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y); +} + +/// Stores a 64-bit double-precision value in a 64-bit memory location. +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVNTSD instruction. +/// +/// \param __p +/// The 64-bit memory location used to store the register value. +/// \param __a +/// The 64-bit double-precision floating-point register value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_stream_sd(double *__p, __m128d __a) +{ + __builtin_ia32_movntsd(__p, (__v2df)__a); +} + +/// Stores a 32-bit single-precision floating-point value in a 32-bit +/// memory location. To minimize caching, the data is flagged as +/// non-temporal (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVNTSS instruction. +/// +/// \param __p +/// The 32-bit memory location used to store the register value. +/// \param __a +/// The 32-bit single-precision floating-point register value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_stream_ss(float *__p, __m128 __a) +{ + __builtin_ia32_movntss(__p, (__v4sf)__a); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __AMMINTRIN_H */ diff --git a/include-llvm/amxintrin.h b/include-llvm/amxintrin.h new file mode 100644 index 0000000..4940666 --- /dev/null +++ b/include-llvm/amxintrin.h @@ -0,0 +1,494 @@ +/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===------------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif /* __IMMINTRIN_H */ + +#ifndef __AMXINTRIN_H +#define __AMXINTRIN_H +#ifdef __x86_64__ + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS_TILE \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) +#define __DEFAULT_FN_ATTRS_INT8 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) +#define __DEFAULT_FN_ATTRS_BF16 \ + __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) + +/// Load tile configuration from a 64-byte memory location specified by +/// "mem_addr". The tile configuration includes the tile type palette, the +/// number of bytes per row, and the number of rows. If the specified +/// palette_id is zero, that signifies the init state for both the tile +/// config and the tile data, and the tiles are zeroed. Any invalid +/// configurations will result in #GP fault. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LDTILECFG instruction. +/// +/// \param __config +/// A pointer to 512-bits configuration +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_loadconfig(const void *__config) { + __builtin_ia32_tile_loadconfig(__config); +} + +/// Stores the current tile configuration to a 64-byte memory location +/// specified by "mem_addr". The tile configuration includes the tile type +/// palette, the number of bytes per row, and the number of rows. If tiles +/// are not configured, all zeroes will be stored to memory. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the STTILECFG instruction. +/// +/// \param __config +/// A pointer to 512-bits configuration +static __inline__ void __DEFAULT_FN_ATTRS_TILE +_tile_storeconfig(void *__config) { + __builtin_ia32_tile_storeconfig(__config); +} + +/// Release the tile configuration to return to the init state, which +/// releases all storage it currently holds. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILERELEASE instruction. +static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { + __builtin_ia32_tilerelease(); +} + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst" using the tile configuration previously configured +/// via "_tile_loadconfig". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADD instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +#define _tile_loadd(dst, base, stride) \ + __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst" using the tile configuration previously configured +/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +#define _tile_stream_loadd(dst, base, stride) \ + __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ + (__SIZE_TYPE__)(stride)) + +/// Store the tile specified by "src" to memory specifieid by "base" address and +/// "stride" using the tile configuration previously configured via +/// "_tile_loadconfig". 
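[Usage illustration, not part of the patch: a minimal sketch of the typical AMX flow combining the configuration, load and store intrinsics above with the dot-product intrinsics defined further below. It assumes clang with -mamx-tile and -mamx-int8, tile registers 0-2, 64-byte row strides, and a 64-byte configuration blob cfg prepared elsewhere; all buffer names are hypothetical.]

#include <immintrin.h>

/* Accumulate one tile of signed 8-bit dot-products: tmm0 += tmm1 . tmm2 */
void amx_dpbssd_once(const void *cfg, const void *A, const void *B, void *C)
{
        _tile_loadconfig(cfg);   /* program the tile shapes from cfg */
        _tile_loadd(1, A, 64);   /* load tmm1 from A with a 64-byte stride */
        _tile_loadd(2, B, 64);   /* load tmm2 from B with a 64-byte stride */
        _tile_zero(0);           /* clear the accumulator tile tmm0 */
        _tile_dpbssd(0, 1, 2);   /* tmm0 += dot-product of tmm1 and tmm2 */
        _tile_stored(0, C, 64);  /* write the 32-bit accumulators back to C */
        _tile_release();         /* return the tile state to init */
}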
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILESTORED instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be stored in memory. +#define _tile_stored(dst, base, stride) \ + __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) + +/// Zero the tile specified by "tdest". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILEZERO instruction. +/// +/// \param tile +/// The destination tile to be zero. Max size is 1024 Bytes. +#define _tile_zero(tile) __builtin_ia32_tilezero((tile)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbssd(dst, src0, src1) \ + __builtin_ia32_tdpbssd((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbsud(dst, src0, src1) \ + __builtin_ia32_tdpbsud((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbusd(dst, src0, src1) \ + __builtin_ia32_tdpbusd((dst), (src0), (src1)) + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in +/// "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. 
+/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbuud(dst, src0, src1) \ + __builtin_ia32_tdpbuud((dst), (src0), (src1)) + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and +/// src1, accumulating the intermediate single-precision (32-bit) floating-point +/// elements with elements in "dst", and store the 32-bit result back to tile +/// "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +#define _tile_dpbf16ps(dst, src0, src1) \ + __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) + +/// AMX tile register size can be configured, the maximum size is 16x64=1024 +/// bytes. Since there is no 2D type in llvm IR, we use vector type to +/// represent 2D tile and the fixed size is maximum amx tile register size. +typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loadd_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloadd64_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, + __SIZE_TYPE__ stride) { + return __builtin_ia32_tileloaddt164_internal(m, n, base, + (__SIZE_TYPE__)(stride)); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 +_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); +} + +/// This is internal intrinsic. C/C++ user should avoid calling it directly. +static __inline__ void __DEFAULT_FN_ATTRS_INT8 +_tile_stored_internal(unsigned short m, unsigned short n, void *base, + __SIZE_TYPE__ stride, _tile1024i tile) { + return __builtin_ia32_tilestored64_internal(m, n, base, + (__SIZE_TYPE__)(stride), tile); +} + +/// This is internal intrinsic. 
C/C++ user should avoid calling it directly. +static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 +_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, + _tile1024i dst, _tile1024i src1, _tile1024i src2) { + return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); +} + +/// This struct pack the shape and tile data together for user. We suggest +/// initializing the struct as early as possible, because compiler depends +/// on the shape information to do configure. The constant value is preferred +/// for optimization by compiler. +typedef struct __tile1024i_str { + const unsigned short row; + const unsigned short col; + _tile1024i tile; +} __tile1024i; + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADD instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TILE +static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); +} + +/// Load tile rows from memory specifieid by "base" address and "stride" into +/// destination tile "dst". This intrinsic provides a hint to the implementation +/// that the data will likely not be reused in the near future and the data +/// caching can be optimized accordingly. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILELOADDT1 instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be loaded in memory. +__DEFAULT_FN_ATTRS_TILE +static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, + __SIZE_TYPE__ stride) { + dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_INT8 +static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBSUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. 
+/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_INT8 +static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit +/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", +/// and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUSD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_INT8 +static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +/// Compute dot-product of bytes in tiles with a source/destination accumulator. +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with +/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate +/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in +/// "dst", and store the 32-bit result back to tile "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBUUD instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. +/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_INT8 +static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +/// Store the tile specified by "src" to memory specifieid by "base" address and +/// "stride". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILESTORED instruction. +/// +/// \param dst +/// A destination tile. Max size is 1024 Bytes. +/// \param base +/// A pointer to base address. +/// \param stride +/// The stride between the rows' data to be stored in memory. +__DEFAULT_FN_ATTRS_TILE +static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, + __tile1024i src) { + _tile_stored_internal(src.row, src.col, base, stride, src.tile); +} + +/// Zero the tile specified by "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TILEZERO instruction. +/// +/// \param dst +/// The destination tile to be zero. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_TILE +static __inline__ void __tile_zero(__tile1024i *dst) { + dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); +} + +/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and +/// src1, accumulating the intermediate single-precision (32-bit) floating-point +/// elements with elements in "dst", and store the 32-bit result back to tile +/// "dst". +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TDPBF16PS instruction. +/// +/// \param dst +/// The destination tile. Max size is 1024 Bytes. 
+/// \param src0 +/// The 1st source tile. Max size is 1024 Bytes. +/// \param src1 +/// The 2nd source tile. Max size is 1024 Bytes. +__DEFAULT_FN_ATTRS_BF16 +static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, + __tile1024i src1) { + dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, + src0.tile, src1.tile); +} + +#undef __DEFAULT_FN_ATTRS_TILE +#undef __DEFAULT_FN_ATTRS_INT8 +#undef __DEFAULT_FN_ATTRS_BF16 + +#endif /* __x86_64__ */ +#endif /* __AMXINTRIN_H */ diff --git a/include-llvm/avx2intrin.h b/include-llvm/avx2intrin.h new file mode 100644 index 0000000..38367a3 --- /dev/null +++ b/include-llvm/avx2intrin.h @@ -0,0 +1,1240 @@ +/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX2INTRIN_H +#define __AVX2INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) + +/* SSE4 Multiple Packed Sums of Absolute Difference. */ +#define _mm256_mpsadbw_epu8(X, Y, M) \ + ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ + (__v32qi)(__m256i)(Y), (int)(M))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_abs_epi8(__m256i __a) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a); +#else + return (__m256i)__builtin_elementwise_abs((__v32qs)__a); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_abs_epi16(__m256i __a) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a); +#else + return (__m256i)__builtin_elementwise_abs((__v16hi)__a); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_abs_epi32(__m256i __a) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pabsd256((__v8si)__a); +#else + return (__m256i)__builtin_elementwise_abs((__v8si)__a); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_packs_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_packs_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_packus_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_packus_epi32(__m256i __V1, __m256i __V2) +{ + return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_add_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)((__v32qu)__a + (__v32qu)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_add_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)((__v16hu)__a + (__v16hu)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_add_epi32(__m256i __a, 
__m256i __b) +{ + return (__m256i)((__v8su)__a + (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_add_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a + (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_adds_epi8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); +#else + return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_adds_epi16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_adds_epu8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); +#else + return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_adds_epu16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); +#else + return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b); +#endif +} + +#define _mm256_alignr_epi8(a, b, n) \ + ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (n))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_and_si256(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a & (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_andnot_si256(__m256i __a, __m256i __b) +{ + return (__m256i)(~(__v4du)__a & (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_avg_epu8(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_avg_epu16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) +{ + return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, + (__v32qi)__M); +} + +#define _mm256_blend_epi16(V1, V2, M) \ + ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ + (__v16hi)(__m256i)(V2), (int)(M))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpeq_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)((__v32qi)__a == (__v32qi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpeq_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)((__v16hi)__a == (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpeq_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8si)__a == (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpeq_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4di)__a == (__v4di)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpgt_epi8(__m256i __a, __m256i __b) +{ + /* This function always performs a signed comparison, but __v32qi is a char + which may be signed or unsigned, so use __v32qs. 
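     For instance, when plain char is unsigned (e.g. under -funsigned-char), a
     direct (__v32qi) comparison would be carried out as an unsigned compare,
     treating bytes 0x80-0xFF as large positive values and yielding a different
     mask than the signed VPCMPGTB semantics this intrinsic must provide.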
*/ + return (__m256i)((__v32qs)__a > (__v32qs)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpgt_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)((__v16hi)__a > (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpgt_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8si)__a > (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmpgt_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4di)__a > (__v4di)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hadd_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hadd_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hadds_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hsub_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hsub_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_hsubs_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maddubs_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epi8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b); +#else + return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epi16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epi32(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b); +#else + return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epu8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b); +#else + return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epu16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epu32(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b); +#else + 
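  /* On clang 14 and newer the generic __builtin_elementwise_max builtin is
     used instead of the target-specific __builtin_ia32_pmaxud256 above; both
     forms are expected to lower to the same VPMAXUD instruction. */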
return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epi8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b); +#else + return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epi16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epi32(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b); +#else + return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epu8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b); +#else + return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epu16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epu32(__m256i __a, __m256i __b) +{ +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b); +#else + return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS256 +_mm256_movemask_epi8(__m256i __a) +{ + return __builtin_ia32_pmovmskb256((__v32qi)__a); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi8_epi16(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi8_epi32(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi8_epi64(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. 
*/ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_epi32(__m128i __V) +{ + return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_epi64(__m128i __V) +{ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_epi64(__m128i __V) +{ + return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu8_epi16(__m128i __V) +{ + return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu8_epi32(__m128i __V) +{ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu8_epi64(__m128i __V) +{ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu16_epi32(__m128i __V) +{ + return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu16_epi64(__m128i __V) +{ + return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_epi64(__m128i __V) +{ + return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mul_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mulhrs_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mulhi_epu16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mulhi_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mullo_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)((__v16hu)__a * (__v16hu)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mullo_epi32 (__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a * (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mul_epu32(__m256i __a, __m256i __b) +{ + return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_or_si256(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a | (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sad_epu8(__m256i __a, __m256i __b) +{ + return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shuffle_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); +} + +#define _mm256_shuffle_epi32(a, imm) \ + ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) + +#define _mm256_shufflehi_epi16(a, 
imm) \ + ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) + +#define _mm256_shufflelo_epi16(a, imm) \ + ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sign_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sign_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sign_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); +} + +#define _mm256_slli_si256(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) + +#define _mm256_bslli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_slli_epi16(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sll_epi16(__m256i __a, __m128i __count) +{ + return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_slli_epi32(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sll_epi32(__m256i __a, __m128i __count) +{ + return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_slli_epi64(__m256i __a, int __count) +{ + return __builtin_ia32_psllqi256((__v4di)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sll_epi64(__m256i __a, __m128i __count) +{ + return __builtin_ia32_psllq256((__v4di)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srai_epi16(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sra_epi16(__m256i __a, __m128i __count) +{ + return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srai_epi32(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sra_epi32(__m256i __a, __m128i __count) +{ + return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); +} + +#define _mm256_srli_si256(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) + +#define _mm256_bsrli_epi128(a, imm) \ + ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srli_epi16(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srl_epi16(__m256i __a, __m128i __count) +{ + return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srli_epi32(__m256i __a, int __count) +{ + return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srl_epi32(__m256i __a, __m128i __count) +{ + return 
(__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srli_epi64(__m256i __a, int __count) +{ + return __builtin_ia32_psrlqi256((__v4di)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srl_epi64(__m256i __a, __m128i __count) +{ + return __builtin_ia32_psrlq256((__v4di)__a, __count); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sub_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)((__v32qu)__a - (__v32qu)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sub_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)((__v16hu)__a - (__v16hu)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sub_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a - (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sub_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a - (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_subs_epi8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); +#else + return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_subs_epi16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); +#else + return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_subs_epu8(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); +#else + return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_subs_epu16(__m256i __a, __m256i __b) +{ +#if (__clang_major__ > 14) + return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); +#else + return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpackhi_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpackhi_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpackhi_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpackhi_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpacklo_epi8(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_unpacklo_epi16(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpacklo_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_unpacklo_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_xor_si256(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a ^ (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_stream_load_si256(__m256i const *__V) +{ + typedef __v4di __v4di_aligned __attribute__((aligned(32))); + return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_broadcastss_ps(__m128 __X) +{ + return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_broadcastsd_pd(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_broadcastss_ps(__m128 __X) +{ + return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_broadcastsd_pd(__m128d __X) +{ + return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastsi128_si256(__m128i __X) +{ + return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); +} + +#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) + +#define _mm_blend_epi32(V1, V2, M) \ + ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M))) + +#define _mm256_blend_epi32(V1, V2, M) \ + ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastb_epi8(__m128i __X) +{ + return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastw_epi16(__m128i __X) +{ + return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastd_epi32(__m128i __X) +{ + return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastq_epi64(__m128i __X) +{ + return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastb_epi8(__m128i __X) +{ + return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastw_epi16(__m128i __X) +{ + return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); +} + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastd_epi32(__m128i __X) +{ + return 
(__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastq_epi64(__m128i __X) +{ + return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); +} + +#define _mm256_permute4x64_pd(V, M) \ + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) +{ + return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); +} + +#define _mm256_permute4x64_epi64(V, M) \ + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) + +#define _mm256_permute2x128_si256(V1, V2, M) \ + ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) + +#define _mm256_extracti128_si256(V, M) \ + ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) + +#define _mm256_inserti128_si256(V1, V2, M) \ + ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ + (__v2di)(__m128i)(V2), (int)(M))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskload_epi32(int const *__X, __m256i __M) +{ + return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskload_epi64(long long const *__X, __m256i __M) +{ + return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskload_epi32(int const *__X, __m128i __M) +{ + return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskload_epi64(long long const *__X, __m128i __M) +{ + return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) +{ + __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) +{ + __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) +{ + __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) +{ + __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sllv_epi32(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_sllv_epi32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sllv_epi64(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_sllv_epi64(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srav_epi32(__m256i __X, __m256i __Y) +{ + return 
(__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srav_epi32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srlv_epi32(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srlv_epi32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srlv_epi64(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srlv_epi64(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); +} + +#define _mm_mask_i32gather_pd(a, m, i, mask, s) \ + ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2df)(__m128d)(mask), (s))) + +#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \ + ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4df)(__m256d)(mask), (s))) + +#define _mm_mask_i64gather_pd(a, m, i, mask, s) \ + ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ + (double const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2df)(__m128d)(mask), (s))) + +#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \ + ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ + (double const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4df)(__m256d)(mask), (s))) + +#define _mm_mask_i32gather_ps(a, m, i, mask, s) \ + ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4sf)(__m128)(mask), (s))) + +#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ + ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ + (float const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8sf)(__m256)(mask), (s))) + +#define _mm_mask_i64gather_ps(a, m, i, mask, s) \ + ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4sf)(__m128)(mask), (s))) + +#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ + ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ + (float const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4sf)(__m128)(mask), (s))) + +#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ + ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4si)(__m128i)(mask), (s))) + +#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ + ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ + (int const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8si)(__m256i)(mask), (s))) + +#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ + ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4si)(__m128i)(mask), (s))) + +#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ + ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ + (int const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4si)(__m128i)(mask), (s))) + +#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ + ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2di)(__m128i)(mask), (s))) + +#define 
_mm256_mask_i32gather_epi64(a, m, i, mask, s) \ + ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4di)(__m256i)(mask), (s))) + +#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ + ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ + (long long const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2di)(__m128i)(mask), (s))) + +#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ + ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ + (long long const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4di)(__m256i)(mask), (s))) + +#define _mm_i32gather_pd(m, i, s) \ + ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ + _mm_setzero_pd()), \ + (s))) + +#define _mm256_i32gather_pd(m, i, s) \ + ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ + (double const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ + _mm256_setzero_pd(), \ + _CMP_EQ_OQ), \ + (s))) + +#define _mm_i64gather_pd(m, i, s) \ + ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ + (double const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ + _mm_setzero_pd()), \ + (s))) + +#define _mm256_i64gather_pd(m, i, s) \ + ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ + (double const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ + _mm256_setzero_pd(), \ + _CMP_EQ_OQ), \ + (s))) + +#define _mm_i32gather_ps(m, i, s) \ + ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) + +#define _mm256_i32gather_ps(m, i, s) \ + ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ + (float const *)(m), \ + (__v8si)(__m256i)(i), \ + (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ + _mm256_setzero_ps(), \ + _CMP_EQ_OQ), \ + (s))) + +#define _mm_i64gather_ps(m, i, s) \ + ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) + +#define _mm256_i64gather_ps(m, i, s) \ + ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ + (float const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ + _mm_setzero_ps()), \ + (s))) + +#define _mm_i32gather_epi32(m, i, s) \ + ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v4si)(__m128i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) + +#define _mm256_i32gather_epi32(m, i, s) \ + ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ + (int const *)(m), (__v8si)(__m256i)(i), \ + (__v8si)_mm256_set1_epi32(-1), (s))) + +#define _mm_i64gather_epi32(m, i, s) \ + ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v2di)(__m128i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) + +#define _mm256_i64gather_epi32(m, i, s) \ + ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ + (int const *)(m), (__v4di)(__m256i)(i), \ + (__v4si)_mm_set1_epi32(-1), (s))) + +#define _mm_i32gather_epi64(m, i, s) \ + ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v2di)_mm_set1_epi64x(-1), (s))) + 
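As a usage sketch (not part of the header, and assuming an AVX2-enabled clang build), the gather macros defined above can be exercised as follows; the table, the index layout, and the 4-byte scale are illustrative choices:

#include <immintrin.h>

static int table[256];

/* Gather eight ints from `table` using eight 32-bit indices loaded from
 * memory. The scale argument (4) is the element size in bytes and must be
 * a compile-time constant of 1, 2, 4 or 8. */
static __m256i gather_eight(const int *idx8)
{
    __m256i idx = _mm256_loadu_si256((const __m256i *)idx8);
    return _mm256_i32gather_epi32(table, idx, 4);
}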
+#define _mm256_i32gather_epi64(m, i, s) \ + ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ + (long long const *)(m), \ + (__v4si)(__m128i)(i), \ + (__v4di)_mm256_set1_epi64x(-1), (s))) + +#define _mm_i64gather_epi64(m, i, s) \ + ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ + (long long const *)(m), \ + (__v2di)(__m128i)(i), \ + (__v2di)_mm_set1_epi64x(-1), (s))) + +#define _mm256_i64gather_epi64(m, i, s) \ + ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ + (long long const *)(m), \ + (__v4di)(__m256i)(i), \ + (__v4di)_mm256_set1_epi64x(-1), (s))) + +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS128 + +#endif /* __AVX2INTRIN_H */ diff --git a/include-llvm/avx512bf16intrin.h b/include-llvm/avx512bf16intrin.h new file mode 100644 index 0000000..eef0fc3 --- /dev/null +++ b/include-llvm/avx512bf16intrin.h @@ -0,0 +1,285 @@ +/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BF16INTRIN_H +#define __AVX512BF16INTRIN_H + +#if (__clang_major__ > 15) +typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); +typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); +typedef __bf16 __bfloat16; +#else +typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64))); +typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32))); +typedef unsigned short __bfloat16; +#endif + +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) + +/// Convert One BF16 Data to One Single Float Data. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __A +/// A bfloat data. +/// \returns A float data whose sign field and exponent field keep unchanged, +/// and fraction field is extended to 23 bits. +static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) { + return __builtin_ia32_cvtsbf162ss_32(__A); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, + (__v16sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \param __W +/// A 512-bit vector of [32 x bfloat]. 
+/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __B +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 32-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from +/// conversion of __B, and higher 256 bits come from conversion of __A. +static __inline__ __m512bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { + return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_cvtneps_pbh(__m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_undefined_si256(), + (__mmask16)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)__W, + (__mmask16)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { + return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, + (__v16hi)_mm256_setzero_si256(), + (__mmask16)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. 
+/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, + (__v16si) __A, + (__v16si) __B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 512-bit vector of [32 x bfloat]. +/// \param __B +/// A 512-bit vector of [32 x bfloat]. +/// \param __D +/// A 512-bit vector of [16 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 512-bit vector of [16 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), + (__v16sf)_mm512_setzero_si512()); +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from conversion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 16-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \returns A 512-bit vector of [16 x float] come from conversion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( + (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 512-bit vector of [16 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 16-bit mask. +/// \param __A +/// A 256-bit vector of [16 x bfloat]. 
+/// \returns A 512-bit vector of [16 x float] come from conversion of __A +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { + return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( + (__m512i)__S, (__mmask16)__U, + (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/include-llvm/avx512bitalgintrin.h b/include-llvm/avx512bitalgintrin.h new file mode 100644 index 0000000..d4411d1 --- /dev/null +++ b/include-llvm/avx512bitalgintrin.h @@ -0,0 +1,83 @@ +/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BITALGINTRIN_H +#define __AVX512BITALGINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi16(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi) _mm512_popcnt_epi16(__B), + (__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(), + __U, + __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_popcnt_epi8(__m512i __A) +{ + return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, + (__v64qi) _mm512_popcnt_epi8(__B), + (__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) +{ + return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(), + __U, + __B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A, + (__v64qi) __B, + __U); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) +{ + return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512bwintrin.h b/include-llvm/avx512bwintrin.h new file mode 100644 index 0000000..717b92b --- /dev/null +++ b/include-llvm/avx512bwintrin.h @@ -0,0 +1,2104 @@ +/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512BWINTRIN_H +#define __AVX512BWINTRIN_H + +typedef unsigned int __mmask32; +typedef unsigned long long __mmask64; + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"))) + +static __inline __mmask32 __DEFAULT_FN_ATTRS +_knot_mask32(__mmask32 __M) +{ + return __builtin_ia32_knotsi(__M); +} + +static __inline __mmask64 __DEFAULT_FN_ATTRS +_knot_mask64(__mmask64 __M) +{ + return __builtin_ia32_knotdi(__M); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kand_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kand_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kandn_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kandn_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kxnor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kxnor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kxor_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kxor_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned 
char)__builtin_ia32_kortestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) +{ + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) +{ + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_kadd_mask32(__mmask32 __A, __mmask32 __B) +{ + return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_kadd_mask64(__mmask64 __A, __mmask64 __B) +{ + return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); +} + +#define _kshiftli_mask32(A, I) \ + ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))) + +#define _kshiftri_mask32(A, I) \ + ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))) + +#define _kshiftli_mask64(A, I) \ + ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))) + +#define _kshiftri_mask64(A, I) \ + ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask32_u32(__mmask32 __A) { + return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_cvtmask64_u64(__mmask64 __A) { + return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_cvtu32_mask32(unsigned int __A) { + return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_cvtu64_mask64(unsigned long long __A) { + return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_load_mask32(__mmask32 *__A) { + return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_load_mask64(__mmask64 *__A) { + return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask32(__mmask32 *__A, __mmask32 __B) { + *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask64(__mmask64 *__A, __mmask64 __B) { + *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B); +} + +/* Integer compare */ + +#define _mm512_cmp_epi8_mask(a, b, p) \ + 
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1)) + +#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \ + ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) + +#define _mm512_cmp_epu8_mask(a, b, p) \ + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)-1)) + +#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \ + ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ + (__v64qi)(__m512i)(b), (int)(p), \ + (__mmask64)(m))) + +#define _mm512_cmp_epi16_mask(a, b, p) \ + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) + +#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \ + ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) + +#define _mm512_cmp_epu16_mask(a, b, p) \ + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)-1)) + +#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \ + ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ + (__v32hi)(__m512i)(b), (int)(p), \ + (__mmask32)(m))) + +#define _mm512_cmpeq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi8_mask(A, B) \ + _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \ + _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu8_mask(A, B) \ + _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) 
+#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \ + _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi16_mask(A, B) \ + _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \ + _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu16_mask(A, B) \ + _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \ + _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_add_epi8 (__m512i __A, __m512i __B) { + return (__m512i) ((__v64qu) __A + (__v64qu) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_add_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sub_epi8 (__m512i __A, __m512i __B) { + return (__m512i) ((__v64qu) __A - (__v64qu) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)__W); +} + 
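For illustration only (not part of the header), a minimal sketch, assuming AVX-512BW is enabled, of how the byte-compare macros and the masked add defined above combine: every byte of v that compares greater than zero is incremented, and the remaining bytes pass through unchanged.

#include <immintrin.h>

static __m512i incr_positive_bytes(__m512i v)
{
    /* 64-bit mask: bit i is set when byte i of v is greater than zero. */
    __mmask64 pos = _mm512_cmpgt_epi8_mask(v, _mm512_setzero_si512());
    /* Masked add: where the mask bit is set take v + 1, elsewhere keep v. */
    return _mm512_mask_add_epi8(v, pos, v, _mm512_set1_epi8(1));
}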
+static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_sub_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_add_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hu) __A + (__v32hu) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_add_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sub_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hu) __A - (__v32hu) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sub_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mullo_epi16 (__m512i __A, __m512i __B) { + return (__m512i) ((__v32hu) __A * (__v32hu) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mullo_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, + (__v64qi) __W, + (__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, + (__v32hi) __W, + (__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_abs_epi8 (__m512i __A) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A); +#else + return (__m512i)__builtin_elementwise_abs((__v64qs)__A); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_abs_epi8(__A), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_abs_epi8(__A), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_abs_epi16 (__m512i __A) +{ +#if (__clang_major__ < 14) + return 
(__m512i)__builtin_ia32_pabsw512((__v32hi)__A); +#else + return (__m512i)__builtin_elementwise_abs((__v32hi)__A); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_abs_epi16(__A), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_abs_epi16(__A), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_packs_epi32(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packs_epi32(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_packs_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packs_epi16(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_packus_epi32(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_packus_epi32(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_packus_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_packus_epi16(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_adds_epi8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return 
(__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B); +#else + return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_adds_epi16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B); +#else + return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_adds_epu8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B); +#else + return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_adds_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_adds_epu16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B); +#else + return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_adds_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_avg_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)__W); +} + +static __inline__ 
__m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_avg_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_avg_epu16 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_avg_epu16(__A, __B), + (__v32hi) _mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epi8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B); +#else + return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epi16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B); +#else + return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epu8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B); +#else + return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_max_epu8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epu16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B); +#else + return 
(__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_max_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epi8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B); +#else + return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epi16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B); +#else + return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epu8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B); +#else + return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_min_epu8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epu16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B); +#else + return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_min_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_shuffle_epi8(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_shuffle_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_subs_epi8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B); +#else + return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_subs_epi16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B); +#else + return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_subs_epu8 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B); +#else + return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_subs_epu8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_subs_epu16 (__m512i __A, __m512i __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, 
(__v32hu) __B); +#else + return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_subs_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)__I); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mulhrs_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhrs_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mulhi_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mulhi_epu16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 
+_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_mulhi_epu16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maddubs_epi16(__m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, + __m512i __Y) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi)_mm512_maddubs_epi16(__X, __Y), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, + (__v32hi)_mm512_maddubs_epi16(__X, __Y), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_madd_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_madd_epi16(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi16_epi8 (__m512i __A) { + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)_mm256_setzero_si256(), + (__mmask32) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi)__O, + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, + (__v32qi) _mm256_setzero_si256(), + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi16_epi8 (__m512i __A) { + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) _mm256_setzero_si256(), + (__mmask32) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) __O, + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, + (__v32qi) _mm256_setzero_si256(), + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_epi8 (__m512i __A) { + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) _mm256_undefined_si256(), + (__mmask32) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) __O, + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) { + return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, + (__v32qi) 
_mm256_setzero_si256(), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) +{ + __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, + 8, 64+8, 9, 64+9, + 10, 64+10, 11, 64+11, + 12, 64+12, 13, 64+13, + 14, 64+14, 15, 64+15, + 24, 64+24, 25, 64+25, + 26, 64+26, 27, 64+27, + 28, 64+28, 29, 64+29, + 30, 64+30, 31, 64+31, + 40, 64+40, 41, 64+41, + 42, 64+42, 43, 64+43, + 44, 64+44, 45, 64+45, + 46, 64+46, 47, 64+47, + 56, 64+56, 57, 64+57, + 58, 64+58, 59, 64+59, + 60, 64+60, 61, 64+61, + 62, 64+62, 63, 64+63); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_unpackhi_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_unpackhi_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, + 4, 32+4, 5, 32+5, + 6, 32+6, 7, 32+7, + 12, 32+12, 13, 32+13, + 14, 32+14, 15, 32+15, + 20, 32+20, 21, 32+21, + 22, 32+22, 23, 32+23, + 28, 32+28, 29, 32+29, + 30, 32+30, 31, 32+31); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_unpackhi_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_unpackhi_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_epi8(__m512i __A, __m512i __B) { + return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, + 0, 64+0, 1, 64+1, + 2, 64+2, 3, 64+3, + 4, 64+4, 5, 64+5, + 6, 64+6, 7, 64+7, + 16, 64+16, 17, 64+17, + 18, 64+18, 19, 64+19, + 20, 64+20, 21, 64+21, + 22, 64+22, 23, 64+23, + 32, 64+32, 33, 64+33, + 34, 64+34, 35, 64+35, + 36, 64+36, 37, 64+37, + 38, 64+38, 39, 64+39, + 48, 64+48, 49, 64+49, + 50, 64+50, 51, 64+51, + 52, 64+52, 53, 64+53, + 54, 64+54, 55, 64+55); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_unpacklo_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { + return 
(__m512i)__builtin_ia32_selectb_512((__mmask64)__U, + (__v64qi)_mm512_unpacklo_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_epi16(__m512i __A, __m512i __B) { + return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, + 0, 32+0, 1, 32+1, + 2, 32+2, 3, 32+3, + 8, 32+8, 9, 32+9, + 10, 32+10, 11, 32+11, + 16, 32+16, 17, 32+17, + 18, 32+18, 19, 32+19, + 24, 32+24, 25, 32+25, + 26, 32+26, 27, 32+27); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_unpacklo_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_unpacklo_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi8_epi16(__m256i __A) +{ + /* This function always performs a signed extension, but __v32qi is a char + which may be signed or unsigned, so use __v32qs. */ + return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepi8_epi16(__A), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu8_epi16(__m256i __A) +{ + return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_cvtepu8_epi16(__A), + (__v32hi)_mm512_setzero_si512()); +} + + +#define _mm512_shufflehi_epi16(A, imm) \ + ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))) + +#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_shufflehi_epi16(U, A, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflehi_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_shufflelo_epi16(A, imm) \ + ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))) + + +#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)(__m512i)(W))) + + +#define _mm512_maskz_shufflelo_epi16(U, A, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shufflelo_epi16((A), \ + (imm)), \ + (__v32hi)_mm512_setzero_si512())) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sllv_epi16(__m512i __A, __m512i 
__B) +{ + return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sllv_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sll_epi16(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sll_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_slli_epi16(__m512i __A, unsigned int __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B); +#else + return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_slli_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +#define _mm512_bslli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srlv_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srlv_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srav_epi16(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srav_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srav_epi16(__A, __B), + 
(__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sra_epi16(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_sra_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srai_epi16(__m512i __A, unsigned int __B) +{ +#if (__clang_major__ > 14) + return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B); +#else + return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srai_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srl_epi16(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srl_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srli_epi16(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srli_epi16(__A, __B), + (__v32hi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, + (__v32hi)_mm512_srli_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +#define _mm512_bsrli_epi128(a, imm) \ + ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, + (__v32hi) __A, + (__v32hi) __W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, + (__v32hi) __A, + (__v32hi) _mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_epi8 (__m512i __W, 
__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, + (__v64qi) __A, + (__v64qi) __W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, + (__v64qi) __A, + (__v64qi) _mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) +{ + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi)_mm512_set1_epi8(__A), + (__v64qi) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) +{ + return (__m512i) __builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_set1_epi8(__A), + (__v64qi) _mm512_setzero_si512()); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS +_mm512_kunpackd (__mmask64 __A, __mmask64 __B) +{ + return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, + (__mmask64) __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS +_mm512_kunpackw (__mmask32 __A, __mmask32 __B) +{ + return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, + (__mmask32) __B); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi16*)__P)->__v; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P, + (__v32hi) __W, + (__mmask32) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P, + (__v32hi) + _mm512_setzero_si512 (), + (__mmask32) __U); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi8 (void const *__P) +{ + struct __loadu_epi8 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi8*)__P)->__v; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P, + (__v64qi) __W, + (__mmask64) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P, + (__v64qi) + _mm512_setzero_si512 (), + (__mmask64) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi16 (void *__P, __m512i __A) +{ + struct __storeu_epi16 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) +{ + __builtin_ia32_storedquhi512_mask ((__v32hi *) __P, + (__v32hi) __A, + (__mmask32) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi8 (void *__P, __m512i __A) +{ + struct __storeu_epi8 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) +{ + __builtin_ia32_storedquqi512_mask ((__v64qi *) __P, + (__v64qi) __A, + (__mmask64) __U); +} + +static __inline__ __mmask64 
__DEFAULT_FN_ATTRS512 +_mm512_test_epi8_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +_mm512_test_epi16_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +_mm512_testn_epi8_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +_mm512_testn_epi16_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 +_mm512_movepi8_mask (__m512i __A) +{ + return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 +_mm512_movepi16_mask (__m512i __A) +{ + return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_movm_epi8 (__mmask64 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2b512 (__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_movm_epi16 (__mmask32 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2w512 (__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcastb_epi8 (__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_broadcastb_epi8(__A), + (__v64qi) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectb_512(__M, + (__v64qi) _mm512_broadcastb_epi8(__A), + (__v64qi) _mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) +{ + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_set1_epi16(__A), + (__v32hi) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) +{ + return (__m512i) __builtin_ia32_selectw_512(__M, + (__v32hi) 
_mm512_set1_epi16(__A), + (__v32hi) _mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcastw_epi16 (__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_broadcastw_epi16(__A), + (__v32hi) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectw_512(__M, + (__v32hi) _mm512_broadcastw_epi16(__A), + (__v32hi) _mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_permutexvar_epi16(__A, __B), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, + (__v32hi)_mm512_permutexvar_epi16(__A, __B), + (__v32hi)__W); +} + +#define _mm512_alignr_epi8(A, B, N) \ + ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(N))) + +#define _mm512_mask_alignr_epi8(W, U, A, B, N) \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)(W))) + +#define _mm512_maskz_alignr_epi8(U, A, B, N) \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ + (__v64qi)(__m512i)_mm512_setzero_si512())) + +#define _mm512_dbsad_epu8(A, B, imm) \ + ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), (int)(imm))) + +#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ + (__v32hi)(__m512i)(W))) + +#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ + (__v32hi)_mm512_setzero_si512())) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sad_epu8 (__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, + (__v64qi) __B); +} + +#undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512cdintrin.h b/include-llvm/avx512cdintrin.h new file mode 100644 index 0000000..bfdba84 --- /dev/null +++ b/include-llvm/avx512cdintrin.h @@ -0,0 +1,123 @@ +/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __AVX512CDINTRIN_H +#define __AVX512CDINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_conflict_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_conflict_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_conflict_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_conflict_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_lzcnt_epi32 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_lzcnt_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_lzcnt_epi64 (__m512i __A) +{ + return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_lzcnt_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m512i) _mm512_set1_epi64((long long) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m512i) _mm512_set1_epi32((int) __A); + +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512dqintrin.h b/include-llvm/avx512dqintrin.h new file mode 100644 index 0000000..3ba0a0c --- /dev/null +++ b/include-llvm/avx512dqintrin.h @@ -0,0 +1,1377 @@ +/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512DQINTRIN_H +#define __AVX512DQINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"))) + +static __inline __mmask8 __DEFAULT_FN_ATTRS +_knot_mask8(__mmask8 __M) +{ + return __builtin_ia32_knotqi(__M); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kand_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kandn_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kxnor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kxor_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) +{ + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_ktestchi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B); + return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_kadd_mask8(__mmask8 __A, __mmask8 __B) +{ + return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_kadd_mask16(__mmask16 __A, __mmask16 __B) +{ + 
return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); +} + +#define _kshiftli_mask8(A, I) \ + ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))) + +#define _kshiftri_mask8(A, I) \ + ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask8_u32(__mmask8 __A) { + return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_cvtu32_mask8(unsigned int __A) { + return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS +_load_mask8(__mmask8 *__A) { + return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask8(__mmask8 *__A, __mmask8 __B) { + *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mullo_epi64 (__m512i __A, __m512i __B) { + return (__m512i) ((__v8du) __A * (__v8du) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullo_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_xor_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A ^ (__v8du)__B); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_xor_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_xor_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_xor_ps (__m512 __A, __m512 __B) { + return (__m512)((__v16su)__A ^ (__v16su)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_xor_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_or_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A | (__v8du)__B); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_or_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_or_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_or_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16su)__A | 
(__v16su)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_or_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_and_pd(__m512d __A, __m512d __B) { + return (__m512d)((__v8du)__A & (__v8du)__B); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_and_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_and_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_and_ps(__m512 __A, __m512 __B) { + return (__m512)((__v16su)__A & (__v16su)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_and_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_andnot_pd(__m512d __A, __m512d __B) { + return (__m512d)(~(__v8du)__A & (__v8du)__B); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_andnot_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_andnot_ps(__m512 __A, __m512 __B) { + return (__m512)(~(__v16su)__A & (__v16su)__B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_andnot_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_epi64 (__m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ 
__m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundpd_epi64(A, R) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_epu64 (__m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundpd_epu64(A, R) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtps_epi64 (__m256 __A) { + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundps_epi64(A, R) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtps_epu64 (__m256 __A) { + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundps_epu64(A, R) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_pd (__m512i __A) { + return (__m512d)__builtin_convertvector((__v8di)__A, __v8df); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepi64_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_cvt_roundepi64_pd(A, R) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ + ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_ps (__m512i __A) { + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) _mm256_setzero_ps(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { + return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, + (__v8sf) _mm256_setzero_ps(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi64_ps(A, R) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ + ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 
+_mm512_cvttpd_epi64 (__m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundpd_epi64(A, R) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttpd_epu64 (__m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { + return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundpd_epu64(A, R) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttps_epi64 (__m256 __A) { + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundps_epi64(A, R) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ + 
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttps_epu64 (__m256 __A) { + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { + return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundps_epu64(A, R) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)(__m512i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ + (__v8di)_mm512_setzero_si512(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtepu64_pd (__m512i __A) { + return (__m512d)__builtin_convertvector((__v8du)__A, __v8df); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtepu64_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_cvt_roundepu64_pd(A, R) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ + ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_cvtepu64_ps (__m512i __A) { + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) _mm256_setzero_ps(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { + return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, + (__v8sf) _mm256_setzero_ps(), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu64_ps(A, R) \ + 
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ + ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_range_pd(A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_pd(W, U, A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_pd(U, A, B, C) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)(__m512d)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm512_maskz_range_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(C), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_range_ps(A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_range_ps(W, U, A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_range_ps(U, A, B, C) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_range_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)(__m512)(W), (__mmask16)(U), \ + (int)(R))) + +#define _mm512_maskz_range_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(C), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +#define _mm_range_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) + +#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_range_round_ss(W, U, A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + 
(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W),\ + (__mmask8)(U), (int)(C),\ + (int)(R))) + +#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_range_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) + +#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) + +#define _mm_range_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8) -1, (int)(C),\ + (int)(R))) + +#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_range_round_sd(W, U, A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W),\ + (__mmask8)(U), (int)(C),\ + (int)(R))) + +#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_range_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C),\ + (int)(R))) + +#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) + +#define _mm512_reduce_pd(A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_pd(W, U, A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_pd(U, A, B) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_reduce_ps(A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ps(W, U, A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ps(U, A, B) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_reduce_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_reduce_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_reduce_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), 
(int)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +#define _mm_reduce_ss(A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_ss(W, U, A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_ss(U, A, B, C) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ + (int)(C), (int)(R))) + +#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) + +#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm_reduce_sd(A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sd(U, A, B, C) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(C), (int)(R))) + +#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), (__mmask8)(U), \ + (int)(C), (int)(R))) + +#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(C), (int)(R))) + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +_mm512_movepi32_mask (__m512i __A) +{ + return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_movm_epi32 (__mmask16 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2d512 (__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_movm_epi64 (__mmask8 __A) +{ + return (__m512i) __builtin_ia32_cvtmask2q512 (__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 +_mm512_movepi64_mask (__m512i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); +} + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_broadcast_f32x2 (__m128 __A) +{ + return 
(__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)__O); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x2(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_broadcast_f32x8(__m256 __A) +{ + return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)__O); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x8(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_broadcast_f64x2(__m128d __A) +{ + return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)__O); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x2(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcast_i32x2 (__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 0, 1, 0, 1, 0, 1, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)__O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x2(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcast_i32x8(__m256i __A) +{ + return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, + 0, 1, 2, 3, 4, 5, 6, 7, + 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)__O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x8(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcast_i64x2(__m128i __A) +{ + return 
(__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)__O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x2(__A), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_extractf32x8_ps(A, imm) \ + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_undefined_ps(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extractf32x8_ps(U, A, imm) \ + ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm512_extractf64x2_pd(A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extractf64x2_pd(U, A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_extracti32x8_epi32(A, imm) \ + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ + ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm512_extracti64x2_epi64(A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)_mm_undefined_si128(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm512_insertf32x8(A, B, imm) \ + ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) + +#define _mm512_mask_insertf32x8(W, U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_insertf32x8(U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) + +#define _mm512_insertf64x2(A, B, imm) \ + ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) + +#define _mm512_mask_insertf64x2(W, U, A, B, imm) \ + 
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_insertf64x2(U, A, B, imm) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_inserti32x8(A, B, imm) \ + ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) + +#define _mm512_mask_inserti32x8(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_inserti32x8(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_inserti64x2(A, B, imm) \ + ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) + +#define _mm512_mask_inserti64x2(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_inserti64x2(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_mask_fpclass_ps_mask(U, A, imm) \ + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)(U))) + +#define _mm512_fpclass_ps_mask(A, imm) \ + ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ + (int)(imm), (__mmask16)-1)) + +#define _mm512_mask_fpclass_pd_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm512_fpclass_pd_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_fpclass_sd_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_sd_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_fpclass_ss_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_ss_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) + +#undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512erintrin.h b/include-llvm/avx512erintrin.h new file mode 100644 index 0000000..1c5a2d2 --- /dev/null +++ b/include-llvm/avx512erintrin.h @@ -0,0 +1,271 @@ +/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512erintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef __AVX512ERINTRIN_H +#define __AVX512ERINTRIN_H + +/* exp2a23 */ +#define _mm512_exp2a23_round_pd(A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_exp2a23_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_exp2a23_pd(A) \ + _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_pd(S, M, A) \ + _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_pd(M, A) \ + _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_exp2a23_round_ps(A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_exp2a23_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_exp2a23_ps(A) \ + _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_exp2a23_ps(S, M, A) \ + _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_exp2a23_ps(M, A) \ + _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +/* rsqrt28 */ +#define _mm512_rsqrt28_round_pd(A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_rsqrt28_pd(A) \ + _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_pd(S, M, A) \ + _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_pd(M, A) \ + _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rsqrt28_round_ps(A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_rsqrt28_ps(A) \ + _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rsqrt28_ps(S, M, A) \ + _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rsqrt28_ps(M, A) \ + _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), 
\ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ + ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rsqrt28_ss(A, B) \ + _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_ss(S, M, A, B) \ + _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_ss(M, A, B) \ + _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rsqrt28_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ + ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rsqrt28_sd(A, B) \ + _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rsqrt28_sd(S, M, A, B) \ + _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rsqrt28_sd(M, A, B) \ + _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +/* rcp28 */ +#define _mm512_rcp28_round_pd(A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_rcp28_round_pd(S, M, A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(S), (__mmask8)(M), \ + (int)(R))) + +#define _mm512_maskz_rcp28_round_pd(M, A, R) \ + ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm512_rcp28_pd(A) \ + _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_pd(S, M, A) \ + _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_pd(M, A) \ + _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_rcp28_round_ps(A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_rcp28_round_ps(S, M, A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(S), (__mmask16)(M), \ + (int)(R))) + +#define _mm512_maskz_rcp28_round_ps(M, A, R) \ + ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(M), (int)(R))) + +#define _mm512_rcp28_ps(A) \ + _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_rcp28_ps(S, M, A) \ + _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_maskz_rcp28_ps(M, A) \ + _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + 
(__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rcp28_round_ss(M, A, B, R) \ + ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rcp28_ss(A, B) \ + _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_ss(S, M, A, B) \ + _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_ss(M, A, B) \ + _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_rcp28_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(S), \ + (__mmask8)(M), (int)(R))) + +#define _mm_maskz_rcp28_round_sd(M, A, B, R) \ + ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(M), (int)(R))) + +#define _mm_rcp28_sd(A, B) \ + _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_mask_rcp28_sd(S, M, A, B) \ + _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#define _mm_maskz_rcp28_sd(M, A, B) \ + _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) + +#endif /* __AVX512ERINTRIN_H */ diff --git a/include-llvm/avx512fintrin.h b/include-llvm/avx512fintrin.h new file mode 100644 index 0000000..cd1dc82 --- /dev/null +++ b/include-llvm/avx512fintrin.h @@ -0,0 +1,9930 @@ +/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use <avx512fintrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512FINTRIN_H +#define __AVX512FINTRIN_H + +typedef char __v64qi __attribute__((__vector_size__(64))); +typedef short __v32hi __attribute__((__vector_size__(64))); +typedef double __v8df __attribute__((__vector_size__(64))); +typedef float __v16sf __attribute__((__vector_size__(64))); +typedef long long __v8di __attribute__((__vector_size__(64))); +typedef int __v16si __attribute__((__vector_size__(64))); + +/* Unsigned types */ +typedef unsigned char __v64qu __attribute__((__vector_size__(64))); +typedef unsigned short __v32hu __attribute__((__vector_size__(64))); +typedef unsigned long long __v8du __attribute__((__vector_size__(64))); +typedef unsigned int __v16su __attribute__((__vector_size__(64))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though.
*/ +typedef signed char __v64qs __attribute__((__vector_size__(64))); + +typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); +typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); +typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); + +typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); + +typedef unsigned char __mmask8; +typedef unsigned short __mmask16; + +/* Rounding mode macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +/* Constants for integer comparison predicates */ +typedef enum { + _MM_CMPINT_EQ, /* Equal */ + _MM_CMPINT_LT, /* Less than */ + _MM_CMPINT_LE, /* Less than or Equal */ + _MM_CMPINT_UNUSED, + _MM_CMPINT_NE, /* Not Equal */ + _MM_CMPINT_NLT, /* Not Less than */ +#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ + _MM_CMPINT_NLE /* Not Less than or Equal */ +#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ +} _MM_CMPINT_ENUM; + +typedef enum +{ + _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, + _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, + _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, + _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, + _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, + _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, + _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, + _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, + _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, + _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, + _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, + _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, + _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, + _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, + _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, + _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, + _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, + _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, + _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, + _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, + _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, + _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, + _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, + _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, + _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, + _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, + _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, + _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, + _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, + _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, + _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, + _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, + _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, 
+ _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, + _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, + _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, + _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, + _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, + _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, + _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, + _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, + _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, + _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, + _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, + _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, + _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, + _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, + _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, + _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, + _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, + _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, + _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, + _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, + _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, + _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, + _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, + _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, + _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, + _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, + _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, + _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, + _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, + _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, + _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, + _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, + _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, + _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, + _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, + _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, + _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, + _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, + _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, + _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, + _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, + _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, + _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, + _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, + _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, + _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, + _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, + _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, + _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, + _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, + _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, + _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, + _MM_PERM_DDDD = 
0xFF +} _MM_PERM_ENUM; + +typedef enum +{ + _MM_MANT_NORM_1_2, /* interval [1, 2) */ + _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ + _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ + _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ +} _MM_MANTISSA_NORM_ENUM; + +typedef enum +{ + _MM_MANT_SIGN_src, /* sign = sign(SRC) */ + _MM_MANT_SIGN_zero, /* sign = 0 */ + _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ +} _MM_MANTISSA_SIGN_ENUM; + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) + +/* Create vectors with repeated elements */ + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_setzero_si512(void) +{ + return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +#define _mm512_setzero_epi32 _mm512_setzero_si512 + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_undefined_pd(void) +{ + return (__m512d)__builtin_ia32_undef512(); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_undefined(void) +{ + return (__m512)__builtin_ia32_undef512(); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_undefined_ps(void) +{ + return (__m512)__builtin_ia32_undef512(); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_undefined_epi32(void) +{ + return (__m512i)__builtin_ia32_undef512(); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcastd_epi32 (__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si) _mm512_broadcastd_epi32(__A), + (__v16si) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si) _mm512_broadcastd_epi32(__A), + (__v16si) _mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcastq_epi64 (__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, + 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di) _mm512_broadcastq_epi64(__A), + (__v8di) __O); + +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di) _mm512_broadcastq_epi64(__A), + (__v8di) _mm512_setzero_si512()); +} + + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_setzero_ps(void) +{ + return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +#define _mm512_setzero _mm512_setzero_ps + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_setzero_pd(void) +{ + return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_set1_ps(float __w) +{ + return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, + 
__w, __w, __w, __w, __w, __w, __w, __w }; +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_set1_pd(double __w) +{ + return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set1_epi8(char __w) +{ + return __extension__ (__m512i)(__v64qi){ + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set1_epi16(short __w) +{ + return __extension__ (__m512i)(__v32hi){ + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set1_epi32(int __s) +{ + return __extension__ (__m512i)(__v16si){ + __s, __s, __s, __s, __s, __s, __s, __s, + __s, __s, __s, __s, __s, __s, __s, __s }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_set1_epi32(__mmask16 __M, int __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__M, + (__v16si)_mm512_set1_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set1_epi64(long long __d) +{ + return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_set1_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_broadcastss_ps(__m128 __A) +{ + return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set4_epi32 (int __A, int __B, int __C, int __D) +{ + return __extension__ (__m512i)(__v16si) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set4_epi64 (long long __A, long long __B, long long __C, + long long __D) +{ + return __extension__ (__m512i) (__v8di) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_set4_pd (double __A, double __B, double __C, double __D) +{ + return __extension__ (__m512d) + { __D, __C, __B, __A, __D, __C, __B, __A }; +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_set4_ps (float __A, float __B, float __C, float __D) +{ + return __extension__ (__m512) + { __D, __C, __B, __A, __D, __C, __B, __A, + __D, __C, __B, __A, __D, __C, __B, __A }; +} + +#define _mm512_setr4_epi32(e0,e1,e2,e3) \ + _mm512_set4_epi32((e3),(e2),(e1),(e0)) + +#define _mm512_setr4_epi64(e0,e1,e2,e3) \ + _mm512_set4_epi64((e3),(e2),(e1),(e0)) + +#define _mm512_setr4_pd(e0,e1,e2,e3) \ + _mm512_set4_pd((e3),(e2),(e1),(e0)) + +#define _mm512_setr4_ps(e0,e1,e2,e3) \ + _mm512_set4_ps((e3),(e2),(e1),(e0)) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_broadcastsd_pd(__m128d __A) +{ + return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, + 0, 0, 0, 0, 0, 0, 0, 0); +} + +/* Cast between vector types */ + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_castpd256_pd512(__m256d 
__a) +{ + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_castps256_ps512(__m256 __a) +{ + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline __m128d __DEFAULT_FN_ATTRS512 +_mm512_castpd512_pd128(__m512d __a) +{ + return __builtin_shufflevector(__a, __a, 0, 1); +} + +static __inline __m256d __DEFAULT_FN_ATTRS512 +_mm512_castpd512_pd256 (__m512d __A) +{ + return __builtin_shufflevector(__A, __A, 0, 1, 2, 3); +} + +static __inline __m128 __DEFAULT_FN_ATTRS512 +_mm512_castps512_ps128(__m512 __a) +{ + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); +} + +static __inline __m256 __DEFAULT_FN_ATTRS512 +_mm512_castps512_ps256 (__m512 __A) +{ + return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_castpd_ps (__m512d __A) +{ + return (__m512) (__A); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_castpd_si512 (__m512d __A) +{ + return (__m512i) (__A); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_castpd128_pd512 (__m128d __A) +{ + return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_castps_pd (__m512 __A) +{ + return (__m512d) (__A); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_castps_si512 (__m512 __A) +{ + return (__m512i) (__A); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_castps128_ps512 (__m128 __A) +{ + return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castsi128_si512 (__m128i __A) +{ + return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castsi256_si512 (__m256i __A) +{ + return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_castsi512_ps (__m512i __A) +{ + return (__m512) (__A); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_castsi512_pd (__m512i __A) +{ + return (__m512d) (__A); +} + +static __inline __m128i __DEFAULT_FN_ATTRS512 +_mm512_castsi512_si128 (__m512i __A) +{ + return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); +} + +static __inline __m256i __DEFAULT_FN_ATTRS512 +_mm512_castsi512_si256 (__m512i __A) +{ + return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_int2mask(int __a) +{ + return (__mmask16)__a; +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_mask2int(__mmask16 __a) +{ + return (int)__a; +} + +/// Constructs a 512-bit floating-point vector of [8 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 384 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. 
+static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_zextpd128_pd512(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); +} + +/// Constructs a 512-bit floating-point vector of [8 x double] from a +/// 256-bit floating-point vector of [4 x double]. The lower 256 bits +/// contain the value of the source vector. The upper 256 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits +/// contain the value of the parameter. The upper 256 bits are set to zero. +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_zextpd256_pd512(__m256d __a) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// Constructs a 512-bit floating-point vector of [16 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 384 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_zextps128_ps512(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); +} + +/// Constructs a 512-bit floating-point vector of [16 x float] from a +/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain +/// the value of the source vector. The upper 256 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits +/// contain the value of the parameter. The upper 256 bits are set to zero. +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_zextps256_ps512(__m256 __a) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +/// Constructs a 512-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 384 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 512-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 384 bits are set to zero. +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_zextsi128_si512(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); +} + +/// Constructs a 512-bit integer vector from a 256-bit integer vector. +/// The lower 256 bits contain the value of the source vector. The upper +/// 256 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 512-bit integer vector. The lower 256 bits contain the value of +/// the parameter. The upper 256 bits are set to zero. 
+static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_zextsi256_si512(__m256i __a) +{ + return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/* Bitwise operators */ +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_and_epi32(__m512i __a, __m512i __b) +{ + return (__m512i)((__v16su)__a & (__v16su)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, + (__v16si) _mm512_and_epi32(__a, __b), + (__v16si) __src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (), + __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_and_epi64(__m512i __a, __m512i __b) +{ + return (__m512i)((__v8du)__a & (__v8du)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, + (__v8di) _mm512_and_epi64(__a, __b), + (__v8di) __src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (), + __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_andnot_si512 (__m512i __A, __m512i __B) +{ + return (__m512i)(~(__v8du)__A & (__v8du)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_andnot_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i)(~(__v16su)__A & (__v16su)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_andnot_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(), + __U, __A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_andnot_epi64(__m512i __A, __m512i __B) +{ + return (__m512i)(~(__v8du)__A & (__v8du)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_andnot_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(), + __U, __A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_or_epi32(__m512i __a, __m512i __b) +{ + return (__m512i)((__v16su)__a | (__v16su)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, + (__v16si)_mm512_or_epi32(__a, __b), + (__v16si)__src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_or_epi64(__m512i __a, __m512i __b) +{ + return 
(__m512i)((__v8du)__a | (__v8du)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, + (__v8di)_mm512_or_epi64(__a, __b), + (__v8di)__src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_xor_epi32(__m512i __a, __m512i __b) +{ + return (__m512i)((__v16su)__a ^ (__v16su)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, + (__v16si)_mm512_xor_epi32(__a, __b), + (__v16si)__src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) +{ + return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_xor_epi64(__m512i __a, __m512i __b) +{ + return (__m512i)((__v8du)__a ^ (__v8du)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, + (__v8di)_mm512_xor_epi64(__a, __b), + (__v8di)__src); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) +{ + return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_and_si512(__m512i __a, __m512i __b) +{ + return (__m512i)((__v8du)__a & (__v8du)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_or_si512(__m512i __a, __m512i __b) +{ + return (__m512i)((__v8du)__a | (__v8du)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_xor_si512(__m512i __a, __m512i __b) +{ + return (__m512i)((__v8du)__a ^ (__v8du)__b); +} + +/* Arithmetic */ + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_add_pd(__m512d __a, __m512d __b) +{ + return (__m512d)((__v8df)__a + (__v8df)__b); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_add_ps(__m512 __a, __m512 __b) +{ + return (__m512)((__v16sf)__a + (__v16sf)__b); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_mul_pd(__m512d __a, __m512d __b) +{ + return (__m512d)((__v8df)__a * (__v8df)__b); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_mul_ps(__m512 __a, __m512 __b) +{ + return (__m512)((__v16sf)__a * (__v16sf)__b); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_sub_pd(__m512d __a, __m512d __b) +{ + return (__m512d)((__v8df)__a - (__v8df)__b); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_sub_ps(__m512 __a, __m512 __b) +{ + return (__m512)((__v16sf)__a - (__v16sf)__b); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_add_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A + (__v8du) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) +{ + return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_add_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sub_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v8du) __A - (__v8du) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sub_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_add_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A + (__v16su) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_add_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sub_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A - (__v16su) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sub_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +#define _mm512_max_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_max_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)(W))) + +#define _mm512_maskz_max_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_max_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_max_pd(__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_max_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_max_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_max_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_max_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + 
(__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)(W))) + +#define _mm512_maskz_max_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_max_ps(__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_max_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_max_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_max_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline __m512i +__DEFAULT_FN_ATTRS512 +_mm512_max_epi32(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B); +#else + return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, 
__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epu32(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B); +#else + return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epu32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_max_epu32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epi64(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B); +#else + return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_max_epu64(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B); +#else + return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epu64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_max_epu64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_min_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_min_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_min_round_pd((A), (B), (R)), \ + (__v8df)(W))) + +#define _mm512_maskz_min_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_min_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_min_pd(__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_min_pd (__m512d 
__W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_min_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_min_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_min_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_min_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ + (__v16sf)(W))) + +#define _mm512_maskz_min_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_min_ps(__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_min_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_min_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_min_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + 
(__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_min_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline __m512i +__DEFAULT_FN_ATTRS512 +_mm512_min_epi32(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B); +#else + return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epu32(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B); +#else + return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epu32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_min_epu32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epi64(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B); +#else + return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_min_epu64(__m512i __A, __m512i __B) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B); +#else + return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epu64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_min_epu64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mul_epi32(__m512i __X, __m512i __Y) +{ + return 
(__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)__W); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epi32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mul_epu32(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)__W); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_mul_epu32(__X, __Y), + (__v8di)_mm512_setzero_si512 ()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mullo_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i) ((__v16su) __A * (__v16su) __B); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_mullo_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mullox_epi64 (__m512i __A, __m512i __B) { + return (__m512i) ((__v8du) __A * (__v8du) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_mullox_epi64(__A, __B), + (__v8di)__W); +} + +#define _mm512_sqrt_round_pd(A, R) \ + ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))) + +#define _mm512_mask_sqrt_round_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sqrt_round_pd((A), (R)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_sqrt_round_pd(U, A, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sqrt_round_pd((A), (R)), \ + (__v8df)_mm512_setzero_pd())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_sqrt_pd(__m512d __A) +{ + return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_sqrt_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_sqrt_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_sqrt_round_ps(A, R) \ + ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))) + +#define _mm512_mask_sqrt_round_ps(W, U, A, R) \ + 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_sqrt_round_ps(U, A, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_sqrt_ps(__m512 __A) +{ + return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_sqrt_ps(__A), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_sqrt_ps(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_rsqrt14_pd(__m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1);} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_rsqrt14_ps(__m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_rsqrt14_ss(__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_rsqrt14_sd(__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + 
return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_rcp14_pd(__m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_rcp14_ps(__m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_rcp14_ss(__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_rcp14_sd(__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_floor_ps(__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_FLOOR, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_floor_pd(__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_FLOOR, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_ceil_ps(__m512 __A) +{ + return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, + _MM_FROUND_CEIL, + (__v16sf) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_ceil_pd(__m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __A, -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, + _MM_FROUND_CEIL, + (__v8df) __W, __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_abs_epi64(__m512i __A) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pabsq512((__v8di)__A); +#else + return (__m512i)__builtin_elementwise_abs((__v8di)__A); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_abs_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_abs_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_abs_epi32(__m512i __A) +{ +#if (__clang_major__ < 14) + return (__m512i)__builtin_ia32_pabsd512((__v16si) __A); +#else + return (__m512i)__builtin_elementwise_abs((__v16si) __A); +#endif +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_abs_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_abs_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_add_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_add_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); +} + +#define _mm_add_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_add_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + 
(__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_add_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_add_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); +} +#define _mm_add_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_add_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_add_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_add_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +#define _mm512_add_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_add_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_add_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_add_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_add_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_add_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_add_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_add_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_sub_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_sub_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); +} +#define _mm_sub_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), 
\ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_sub_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_sub_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_sub_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); +} + +#define _mm_sub_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sub_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_sub_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_sub_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +#define _mm512_sub_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_sub_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_sub_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_sub_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_sub_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_sub_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 
__B) { + __A = _mm_mul_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_mul_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); +} +#define _mm_mul_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_mul_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_mul_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_mul_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); +} + +#define _mm_mul_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_mul_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_mul_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_mul_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +#define _mm512_mul_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_mul_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_mul_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_mul_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_mul_round_ps(W, U, A, B, R) \ + 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_mul_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_div_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { + __A = _mm_div_ss(__A, __B); + return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); +} + +#define _mm_div_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm_maskz_div_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_div_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { + __A = _mm_div_sd(__A, __B); + return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); +} + +#define _mm_div_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_div_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_div_pd(__m512d __a, __m512d __b) +{ + return (__m512d)((__v8df)__a/(__v8df)__b); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_div_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_div_ps(__m512 __a, __m512 __b) +{ + return (__m512)((__v16sf)__a/(__v16sf)__b); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_div_ps(__A, __B), + 
(__v16sf)_mm512_setzero_ps()); +} + +#define _mm512_div_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(R))) + +#define _mm512_mask_div_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_div_round_pd((A), (B), (R)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_div_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_div_round_pd((A), (B), (R)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_div_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(R))) + +#define _mm512_mask_div_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_div_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ + (__v16sf)_mm512_setzero_ps())) + +#define _mm512_roundscale_ps(A, B) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ps(A, B, C, imm) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + (__v16sf)(__m512)(A), (__mmask16)(B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ps(A, B, imm) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ + (__v16sf)(__m512)(A), (__mmask16)(B), \ + (int)(R))) + +#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(A), (int)(R))) + +#define _mm512_roundscale_round_ps(A, imm, R) \ + ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_roundscale_pd(A, B) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_pd(A, B, C, imm) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + (__v8df)(__m512d)(A), (__mmask8)(B), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_pd(A, B, imm) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ + (__v8df)(__m512d)(A), (__mmask8)(B), \ + (int)(R))) + +#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(A), (int)(R))) + +#define _mm512_roundscale_round_pd(A, imm, R) \ + ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_fmadd_round_pd(A, B, C, R) \ + 
((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_fmsub_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_fnmadd_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_fnmsub_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 
+_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmadd_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_fmsub_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + 
+ +#define _mm512_fnmadd_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_fnmsub_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmadd_ps(__mmask16 
__U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmaddsub_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_fmsubadd_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)-1, (int)(R))) + + +#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + -(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) -1, 
+ _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, + (__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmaddsub_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_fmsubadd_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)-1, (int)(R))) + + +#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, 
__m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, + (__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ + ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ + -(__v8df)(__m512d)(B), \ + 
-(__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ + ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(C), \ + (__mmask8)(U), (int)(R))) + + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) +{ + return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, + -(__v8df) __B, + -(__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) +{ + return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, + (__v8df) __B, + (__v8df) __C, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ + ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ + -(__v16sf)(__m512)(B), \ + -(__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ + ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(C), \ + (__mmask16)(U), (int)(R))) + + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) +{ + return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, + -(__v16sf) __B, + -(__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) +{ + return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, + (__v16sf) __B, + (__v16sf) __C, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + + + +/* Vector permutations */ + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, + (__v16si) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)__I); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, + (__v8di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + 
(__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)__I); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_alignr_epi64(A, B, I) \ + ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) + +#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_alignr_epi64(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_alignr_epi32(A, B, I) \ + ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) + +#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_alignr_epi32(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) +/* Vector Extract */ + +#define _mm512_extractf64x4_pd(A, I) \ + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ + (__v4df)_mm256_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extractf64x4_pd(U, A, imm) \ + ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm512_extractf32x4_ps(A, I) \ + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) + +#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extractf32x4_ps(U, A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +/* Vector Blend */ + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) +{ + return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, + (__v8df) __W, + (__v8df) __A); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) +{ + return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, + (__v16sf) __W, + (__v16sf) __A); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, + (__v8di) __W, + (__v8di) __A); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) +{ + return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, + (__v16si) __W, + (__v16si) __A); +} + +/* Compare */ + +#define _mm512_cmp_round_ps_mask(A, B, P, R) \ + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + 
(__mmask16)-1, (int)(R))) + +#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ + ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(P), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_cmp_ps_mask(A, B, P) \ + _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_cmp_ps_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_cmpeq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) +#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_ps_mask(A, B) \ + _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_ps_mask(k, A, B) \ + _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) + +#define _mm512_cmp_round_pd_mask(A, B, P, R) \ + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ + ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(P), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_cmp_pd_mask(A, B, P) \ + _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) +#define _mm512_mask_cmp_pd_mask(U, A, B, P) \ + _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_cmpeq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) +#define _mm512_mask_cmpeq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) + +#define _mm512_cmplt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) +#define _mm512_mask_cmplt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) + +#define _mm512_cmple_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) +#define _mm512_mask_cmple_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) + +#define _mm512_cmpunord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) +#define _mm512_mask_cmpunord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) + +#define _mm512_cmpneq_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) +#define _mm512_mask_cmpneq_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) + +#define _mm512_cmpnlt_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) +#define 
_mm512_mask_cmpnlt_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) + +#define _mm512_cmpnle_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) +#define _mm512_mask_cmpnle_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) + +#define _mm512_cmpord_pd_mask(A, B) \ + _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) +#define _mm512_mask_cmpord_pd_mask(k, A, B) \ + _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) + +/* Conversion */ + +#define _mm512_cvtt_roundps_epu32(A, R) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_undefined_epi32(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) + + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttps_epu32(__m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi32_ps(A, R) \ + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ + ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_cvt_roundepu32_ps(A, R) \ + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ + ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_ps (__m512i __A) +{ + return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepu32_ps(__A), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepu32_ps(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline __m512d 
__DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_pd(__m256i __A) +{ + return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepi32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_ps (__m512i __A) +{ + return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepi32_ps(__A), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_cvtepi32_ps(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_pd(__m256i __A) +{ + return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_cvtepu32_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32lo_pd(__m512i __A) +{ + return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) +{ + return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); +} + +#define _mm512_cvt_roundpd_ps(A, R) \ + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)(__m256)(W), (__mmask8)(U), \ + (int)(R))) + +#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ + ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_ps (__m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) _mm256_undefined_ps (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) __W, + (__mmask8) 
__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) +{ + return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, + (__v8sf) _mm256_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_pslo (__m512d __A) +{ + return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) +{ + return (__m512) __builtin_shufflevector ( + (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), + __U, __A), + (__v8sf) _mm256_setzero_ps (), + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +#define _mm512_cvt_roundps_ph(A, I) \ + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_undefined_si256(), \ + (__mmask16)-1)) + +#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)(__m256i)(U), \ + (__mmask16)(W))) + +#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ + ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ + (__v16hi)_mm256_setzero_si256(), \ + (__mmask16)(W))) + +#define _mm512_cvtps_ph _mm512_cvt_roundps_ph +#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph +#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph + +#define _mm512_cvt_roundph_ps(A, R) \ + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_ps(U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_cvtph_ps(__m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) +{ + return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundpd_epi32(A, R) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) + +static __inline __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvttpd_epi32(__m512d __a) +{ + return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, + 
(__v8si)_mm256_setzero_si256(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, + (__v8si) _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundps_epi32(A, R) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttps_epi32(__m512 __a) +{ + return (__m512i) + __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, + (__v16si) _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundps_epi32(A, R) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtps_epi32 (__m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) _mm512_undefined_epi32 (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundpd_epi32(A, R) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), 
(int)(R))) + +#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ + ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_epi32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundps_epu32(A, R) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)(__m512i)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ + (__v16si)_mm512_setzero_si512(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtps_epu32 ( __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\ + (__v16si)\ + _mm512_undefined_epi32 (), + (__mmask16) -1,\ + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) +{ + return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U , + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundpd_epu32(A, R) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_cvtsd_f64(__m512d __a) +{ + return __a[0]; +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_cvtss_f32(__m512 __a) +{ + return __a[0]; +} + +/* Unpack and Interleave */ + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_pd(__m512d __a, __m512d __b) +{ + return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, + 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_unpackhi_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_unpackhi_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_pd(__m512d __a, __m512d __b) +{ + return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, + 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_unpacklo_pd(__A, __B), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, + (__v8df)_mm512_unpacklo_pd(__A, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_ps(__m512 __a, __m512 __b) +{ + return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, + 2, 18, 3, 19, + 2+4, 18+4, 3+4, 19+4, + 2+8, 18+8, 3+8, 19+8, + 2+12, 18+12, 3+12, 19+12); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, + (__v16sf)_mm512_unpackhi_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, + (__v16sf)_mm512_unpackhi_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_ps(__m512 __a, __m512 __b) +{ + return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, + 0, 16, 1, 17, + 0+4, 16+4, 1+4, 17+4, + 0+8, 16+8, 1+8, 17+8, + 0+12, 16+12, 1+12, 17+12); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, + (__v16sf)_mm512_unpacklo_ps(__A, __B), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, + (__v16sf)_mm512_unpacklo_ps(__A, __B), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_epi32(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, + 2, 18, 3, 19, + 2+4, 18+4, 3+4, 19+4, + 2+8, 18+8, 3+8, 19+8, + 2+12, 18+12, 3+12, 19+12); +} + +static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, + (__v16si)_mm512_unpackhi_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, + (__v16si)_mm512_unpackhi_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_epi32(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, + 0, 16, 1, 17, + 0+4, 16+4, 1+4, 17+4, + 0+8, 16+8, 1+8, 17+8, + 0+12, 16+12, 1+12, 17+12); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, + (__v16si)_mm512_unpacklo_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, + (__v16si)_mm512_unpacklo_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpackhi_epi64(__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, + 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, + (__v8di)_mm512_unpackhi_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, + (__v8di)_mm512_unpackhi_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, + 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, + (__v8di)_mm512_unpacklo_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, + (__v8di)_mm512_unpacklo_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + + +/* SIMD load ops */ + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_si512 (void const *__P) +{ + struct __loadu_si512 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_si512*)__P)->__v; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi32*)__P)->__v; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, + (__v16si) __W, + (__mmask16) __U); +} + + +static __inline __m512i __DEFAULT_FN_ATTRS512 
+_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi64*)__P)->__v; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_loadu_pd(void const *__p) +{ + struct __loadu_pd { + __m512d_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pd*)__p)->__v; +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_loadu_ps(void const *__p) +{ + struct __loadu_ps { + __m512_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ps*)__p)->__v; +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_load_ps(void const *__p) +{ + return *(const __m512*)__p; +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_load_ps(__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_load_pd(void const *__p) +{ + return *(const __m512d*)__p; +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_load_pd(__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_load_si512 (void const *__P) +{ + return *(const 
__m512i *) __P; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_load_epi32 (void const *__P) +{ + return *(const __m512i *) __P; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_load_epi64 (void const *__P) +{ + return *(const __m512i *) __P; +} + +/* SIMD store ops */ + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi64 (void *__P, __m512i __A) +{ + struct __storeu_epi64 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_si512 (void *__P, __m512i __A) +{ + struct __storeu_si512 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si512*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_epi32 (void *__P, __m512i __A) +{ + struct __storeu_epi32 { + __m512i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi32*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, + (__mmask16) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_pd(void *__P, __m512d __A) +{ + struct __storeu_pd { + __m512d_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pd*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, + (__mmask16) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_storeu_ps(void *__P, __m512 __A) +{ + struct __storeu_ps { + __m512_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ps*)__P)->__v = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_store_pd(void *__P, __m512d __A) +{ + *(__m512d*)__P = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, + (__mmask16) __U); +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_store_ps(void *__P, __m512 __A) +{ + *(__m512*)__P = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_store_si512 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_store_epi32 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +static __inline void __DEFAULT_FN_ATTRS512 +_mm512_store_epi64 (void *__P, __m512i __A) +{ + *(__m512i *) __P = __A; +} + +/* Mask ops */ + +static __inline __mmask16 __DEFAULT_FN_ATTRS +_mm512_knot(__mmask16 __M) +{ + return __builtin_ia32_knothi(__M); +} + +/* Integer compare */ + +#define _mm512_cmpeq_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ + 
_mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi32_mask(A, B) \ + _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \ + _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu32_mask(A, B) \ + _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \ + _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epi64_mask(A, B) \ + _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \ + _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm512_cmpeq_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define 
_mm512_mask_cmpeq_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm512_cmpge_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm512_mask_cmpge_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm512_cmpgt_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm512_cmple_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm512_mask_cmple_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm512_cmplt_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm512_mask_cmplt_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm512_cmpneq_epu64_mask(A, B) \ + _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \ + _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi8_epi32(__m128i __A) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi8_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi8_epi64(__m128i __A) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. 
*/ + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi8_epi64(__A), + (__v8di)_mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_epi64(__m256i __X) +{ + return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi32_epi64(__X), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_epi32(__m256i __A) +{ + return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepi16_epi32(__A), + (__v16si)_mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_epi64(__m128i __A) +{ + return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepi16_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu8_epi32(__m128i __A) +{ + return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu8_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu8_epi64(__m128i __A) +{ + return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) +{ + return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu8_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_epi64(__m256i __X) +{ + return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu32_epi64(__X), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu16_epi32(__m256i __A) +{ + return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_cvtepu16_epi32(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtepu16_epi64(__m128i __A) +{ + return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_cvtepu16_epi64(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_rorv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rorv_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rorv_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_rorv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rorv_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + 
(__v8di)_mm512_rorv_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + + + +#define _mm512_cmp_epi32_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm512_cmp_epu32_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm512_cmp_epi64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm512_cmp_epu64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ + (__v16si)(__m512i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ + (__v8di)(__m512i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm512_rol_epi32(a, b) \ + ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) + +#define _mm512_mask_rol_epi32(W, U, a, b) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_rol_epi32(U, a, b) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_rol_epi32((a), (b)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_rol_epi64(a, b) \ + ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) + +#define _mm512_mask_rol_epi64(W, U, a, b) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_rol_epi64(U, a, b) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_rol_epi64((a), (b)), \ + (__v8di)_mm512_setzero_si512())) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_rolv_epi32 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rolv_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_rolv_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_rolv_epi64 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rolv_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) +{ + return 
(__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_rolv_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_ror_epi32(A, B) \ + ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) + +#define _mm512_mask_ror_epi32(W, U, A, B) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_ror_epi32(U, A, B) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_ror_epi32((A), (B)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_ror_epi64(A, B) \ + ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) + +#define _mm512_mask_ror_epi64(W, U, A, B) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_ror_epi64(U, A, B) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_ror_epi64((A), (B)), \ + (__v8di)_mm512_setzero_si512())) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_slli_epi32(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_slli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_slli_epi64(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_slli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srli_epi32(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srli_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srli_epi64(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 +_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srli_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, + (__v16si) __A, + (__v16si) __W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, + (__v16si) __A, + (__v16si) _mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, + (__v8di) __A, + (__v8di) __W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, + (__v8di) __A, + (__v8di) _mm512_setzero_si512 ()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_movedup_pd (__m512d __A) +{ + return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, + 0, 0, 2, 2, 4, 4, 6, 6); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_movedup_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_movedup_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + 
(__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_fixupimm_pd(A, B, C, imm) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ + ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + (int)(R))) + +#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ + ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8di)(__m512i)(C), \ + (int)(imm), (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_fixupimm_ps(A, B, C, imm) \ + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ + ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + (int)(R))) + +#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_round_sd(A, B, C, imm, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_fixupimm_sd(A, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ + ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ 
+ ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_fixupimm_round_ss(A, B, C, imm, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_fixupimm_ss(A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_getexp_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_getexp_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, + (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_getexp_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_getexp_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_getexp_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_getexp_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, + (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) 
__builtin_ia32_getexpss128_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_getexp_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_getexp_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_getmant_round_sd(A, B, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_getmant_sd(A, B, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sd(W, U, A, B, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_getmant_sd(U, A, B, C, D) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ + ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (int)(((D)<<2) | (C)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_getmant_round_ss(A, B, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_getmant_ss(A, B, C, D) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_ss(W, U, A, B, C, D) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_getmant_ss(U, A, B, C, D) \ + ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ + 
((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (int)(((D)<<2) | (C)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kmov (__mmask16 __A) +{ + return __A; +} + +#define _mm_comi_round_sd(A, B, P, R) \ + ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ + (int)(P), (int)(R))) + +#define _mm_comi_round_ss(A, B, P, R) \ + ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ + (int)(P), (int)(R))) + +#ifdef __x86_64__ +#define _mm_cvt_roundsd_si64(A, R) \ + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sll_epi32(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sll_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sll_epi64(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sll_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sllv_epi32(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sllv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sllv_epi64(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sllv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sra_epi32(__m512i __A, __m128i __B) +{ + return 
(__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_sra_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_sra_epi64(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_sra_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srav_epi32(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srav_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srav_epi64(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srav_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srl_epi32(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srl_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srl_epi64(__m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srl_epi64(__m512i __W, __mmask8 
__U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srl_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srlv_epi32(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srlv_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srlv_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srlv_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_ternarylogic_epi32(A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)-1)) + +#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), (int)(imm), \ + (__mmask16)(U))) + +#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), \ + (__v16si)(__m512i)(C), \ + (int)(imm), (__mmask16)(U))) + +#define _mm512_ternarylogic_epi64(A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (__v8di)(__m512i)(C), (int)(imm), \ + (__mmask8)(U))) + +#ifdef __x86_64__ +#define _mm_cvt_roundsd_i64(A, R) \ + ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) +#endif + +#define _mm_cvt_roundsd_si32(A, R) \ + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) + +#define _mm_cvt_roundsd_i32(A, R) \ + ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) + +#define _mm_cvt_roundsd_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))) + +static __inline__ unsigned 
__DEFAULT_FN_ATTRS128 +_mm_cvtsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvt_roundsd_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvtsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvt_roundss_si32(A, R) \ + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) + +#define _mm_cvt_roundss_i32(A, R) \ + ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) + +#ifdef __x86_64__ +#define _mm_cvt_roundss_si64(A, R) \ + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) + +#define _mm_cvt_roundss_i64(A, R) \ + ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) +#endif + +#define _mm_cvt_roundss_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))) + +static __inline__ unsigned __DEFAULT_FN_ATTRS128 +_mm_cvtss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvt_roundss_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvtss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundsd_i32(A, R) \ + ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) + +#define _mm_cvtt_roundsd_si32(A, R) \ + ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 +_mm_cvttsd_i32 (__m128d __A) +{ + return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsd_si64(A, R) \ + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) + +#define _mm_cvtt_roundsd_i64(A, R) \ + ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 +_mm_cvttsd_i64 (__m128d __A) +{ + return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundsd_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R))) + +static __inline__ unsigned __DEFAULT_FN_ATTRS128 +_mm_cvttsd_u32 (__m128d __A) +{ + return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsd_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ + (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvttsd_u64 (__m128d __A) +{ + return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) + __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundss_i32(A, R) \ + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) + +#define _mm_cvtt_roundss_si32(A, R) \ + ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 +_mm_cvttss_i32 (__m128 __A) +{ + return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundss_i64(A, R) \ + 
((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) + +#define _mm_cvtt_roundss_si64(A, R) \ + ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 +_mm_cvttss_i64 (__m128 __A) +{ + return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundss_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R))) + +static __inline__ unsigned __DEFAULT_FN_ATTRS128 +_mm_cvttss_u32 (__m128 __A) +{ + return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundss_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ + (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvttss_u64 (__m128 __A) +{ + return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) + __A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm512_permute_pd(X, C) \ + ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))) + +#define _mm512_mask_permute_pd(W, U, X, C) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_permute_pd(U, X, C) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permute_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_permute_ps(X, C) \ + ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))) + +#define _mm512_mask_permute_ps(W, U, X, C) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_permute_ps(U, X, C) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_permute_ps((X), (C)), \ + (__v16sf)_mm512_setzero_ps())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_permutevar_pd(__m512d __A, __m512i __C) +{ + return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutevar_pd(__A, __C), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_permutevar_ps(__m512 __A, __m512i __C) +{ + return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutevar_ps(__A, __C), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline __m512d __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) +{ + return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, + (__v8df)__B); +} + 
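As an illustrative aside (not part of the vendored header): a minimal usage sketch of _mm512_permutex2var_pd, assuming an AVX-512F target compiled with clang. The helper name interleave_lo and the index constant are hypothetical, chosen only to show that indices 0-7 select elements of the first operand and 8-15 select elements of the second.

#include <immintrin.h>

/* Interleave the low halves of two __m512d vectors into a0,b0,a1,b1,a2,b2,a3,b3.
 * For the 8-element double permute, index bits [2:0] pick the element and
 * bit 3 picks the source vector (0 = a, 1 = b). */
static __m512d interleave_lo(__m512d a, __m512d b)
{
    const __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
    return _mm512_permutex2var_pd(a, idx, b);
}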
+static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)__A); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, + __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)(__m512d)__I); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, + __m512d __B) +{ + return (__m512d)__builtin_ia32_selectpd_512(__U, + (__v8df)_mm512_permutex2var_pd(__A, __I, __B), + (__v8df)_mm512_setzero_pd()); +} + +static __inline __m512 __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) +{ + return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, + (__v16sf) __B); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)__A); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)(__m512)__I); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) +{ + return (__m512)__builtin_ia32_selectps_512(__U, + (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), + (__v16sf)_mm512_setzero_ps()); +} + + +#define _mm512_cvtt_roundpd_epu32(A, R) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_undefined_si256(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)(__m256i)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ + ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ + (__v8si)_mm256_setzero_si256(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvttpd_epu32 (__m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_undefined_si256 (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) +{ + return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_roundscale_round_sd(A, B, imm, R) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) + +#define _mm_roundscale_sd(A, B, imm) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(imm), \ + 
_MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sd(W, U, A, B, imm) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) + +#define _mm_maskz_roundscale_sd(U, A, B, I) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ + ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) + +#define _mm_roundscale_round_ss(A, B, imm, R) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + (int)(R))) + +#define _mm_roundscale_ss(A, B, imm) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(imm), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_ss(W, U, A, B, I) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) + +#define _mm_maskz_roundscale_ss(U, A, B, I) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ + ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(I), \ + (int)(R))) + +#define _mm512_scalef_round_pd(A, B, R) \ + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_scalef_round_pd(U, A, B, R) \ + ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_scalef_pd (__m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_pd 
(__mmask8 __U, __m512d __A, __m512d __B) +{ + return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, + (__v8df) __B, + (__v8df) + _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_scalef_round_ps(A, B, R) \ + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_scalef_round_ps(U, A, B, R) \ + ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_scalef_ps (__m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) +{ + return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, + (__v16sf) __B, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_scalef_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_scalef_sd (__m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A, + (__v2df)( __B), (__v2df) _mm_setzero_pd(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_scalef_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_scalef_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_scalef_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_scalef_ss (__m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A, + (__v4sf)( __B), (__v4sf) _mm_setzero_ps(), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static 
__inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_scalef_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_scalef_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srai_epi32(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, + unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srai_epi32(__A, __B), + (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, + unsigned int __B) { + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, + (__v16si)_mm512_srai_epi32(__A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_srai_epi64(__m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srai_epi64(__A, __B), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, + (__v8di)_mm512_srai_epi64(__A, __B), + (__v8di)_mm512_setzero_si512()); +} + +#define _mm512_shuffle_f32x4(A, B, imm) \ + ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(imm))) + +#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) + +#define _mm512_shuffle_f64x2(A, B, imm) \ + ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(imm))) + +#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_shuffle_i32x4(A, B, imm) \ + ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(imm))) + +#define 
_mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_shuffle_i64x2(A, B, imm) \ + ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(imm))) + +#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_shuffle_pd(A, B, M) \ + ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(B), (int)(M))) + +#define _mm512_mask_shuffle_pd(W, U, A, B, M) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_shuffle_pd(U, A, B, M) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_shuffle_ps(A, B, M) \ + ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(B), (int)(M))) + +#define _mm512_mask_shuffle_ps(W, U, A, B, M) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_shuffle_ps(U, A, B, M) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ + (__v16sf)_mm512_setzero_ps())) + +#define _mm_sqrt_round_sd(A, B, R) \ + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, + (__v2df) __B, + (__v2df) _mm_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_sqrt_round_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_sqrt_round_ss(A, B, R) \ + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_sqrt_round_ss(W, U, 
A, B, R) \ + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(W), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, + (__v4sf) __B, + (__v4sf) _mm_setzero_ps (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_sqrt_round_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_broadcast_f32x4(__m128 __A) +{ + return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)__O); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, + (__v16sf)_mm512_broadcast_f32x4(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_broadcast_f64x4(__m256d __A) +{ + return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)__O); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, + (__v8df)_mm512_broadcast_f64x4(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcast_i32x4(__m128i __A) +{ + return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)__O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_broadcast_i32x4(__A), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_broadcast_i64x4(__m256i __A) +{ + return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)__O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_broadcast_i64x4(__A), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) +{ + return 
(__m512d)__builtin_ia32_selectpd_512(__M, + (__v8df) _mm512_broadcastsd_pd(__A), + (__v8df) __O); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m512d)__builtin_ia32_selectpd_512(__M, + (__v8df) _mm512_broadcastsd_pd(__A), + (__v8df) _mm512_setzero_pd()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512(__M, + (__v16sf) _mm512_broadcastss_ps(__A), + (__v16sf) __O); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) +{ + return (__m512)__builtin_ia32_selectps_512(__M, + (__v16sf) _mm512_broadcastss_ps(__A), + (__v16sf) _mm512_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) _mm256_undefined_si256 (), + (__mmask16) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, + (__v16hi) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) 
_mm256_undefined_si256 (), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, + (__v8si) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtsepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) _mm256_undefined_si256 (), + (__mmask16) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) __O, + __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, + (__v16hi) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 
+_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) _mm256_undefined_si256 (), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, + (__v8si) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtusepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_epi16 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) _mm256_undefined_si256 (), + (__mmask16) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) +{ + return (__m256i) 
__builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) __O, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, + (__v16hi) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) +{ + __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_epi8 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_epi32 (__m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) _mm256_undefined_si256 (), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) __O, __M); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) +{ + return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, + (__v8si) _mm256_setzero_si256 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_epi16 (__m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) _mm_undefined_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) +{ + return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) +{ + __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); +} + +#define _mm512_extracti32x4_epi32(A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ + 
(__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm512_extracti64x4_epi64(A, imm) \ + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_undefined_si256(), \ + (__mmask8)-1)) + +#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)(__m256i)(W), \ + (__mmask8)(U))) + +#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ + ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ + (__v4di)_mm256_setzero_si256(), \ + (__mmask8)(U))) + +#define _mm512_insertf64x4(A, B, imm) \ + ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) + +#define _mm512_mask_insertf64x4(W, U, A, B, imm) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_insertf64x4(U, A, B, imm) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_inserti64x4(A, B, imm) \ + ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) + +#define _mm512_mask_inserti64x4(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_inserti64x4(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_insertf32x4(A, B, imm) \ + ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) + +#define _mm512_mask_insertf32x4(W, U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)(__m512)(W))) + +#define _mm512_maskz_insertf32x4(U, A, B, imm) \ + ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ + (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ + (__v16sf)_mm512_setzero_ps())) + +#define _mm512_inserti32x4(A, B, imm) \ + ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) + +#define _mm512_mask_inserti32x4(W, U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_inserti32x4(U, A, B, imm) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_getmant_round_pd(A, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_getmant_pd(A, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define 
_mm512_mask_getmant_pd(W, U, A, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_pd(U, A, B, C) \ + ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ps(A, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2) | (B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_getmant_ps(A, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ps(W, U, A, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ps(U, A, B, C) \ + ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ + (int)(((C)<<2)|(B)), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getexp_round_pd(A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_getexp_round_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_getexp_round_pd(U, A, R) \ + ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_getexp_pd (__m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) _mm512_undefined_pd (), + (__mmask8) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_getexp_round_ps(A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_getexp_round_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)(__m512)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_getexp_round_ps(U, A, R) \ + ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ + (__v16sf)_mm512_setzero_ps(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ 
__m512 __DEFAULT_FN_ATTRS512 +_mm512_getexp_ps (__m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) _mm512_undefined_ps (), + (__mmask16) -1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps (), + (__mmask16) __U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_i64gather_ps(index, addr, scale) \ + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) + +#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ + ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64gather_epi32(index, addr, scale) \ + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)-1, (int)(scale))) + +#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64gather_pd(index, addr, scale) \ + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) + +#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ + ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64gather_epi64(index, addr, scale) \ + ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), (__mmask8)-1, \ + (int)(scale))) + +#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v8di)(__m512i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i32gather_ps(index, addr, scale) \ + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)-1, (int)(scale))) + +#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ + ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512)(index), \ + (__mmask16)(mask), (int)(scale))) + +#define _mm512_i32gather_epi32(index, addr, scale) \ + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)-1, (int)(scale))) + +#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v16si)(__m512i)(index), \ + (__mmask16)(mask), (int)(scale))) + +#define _mm512_i32gather_pd(index, addr, scale) \ + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ 
+ (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) + +#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ + ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i32gather_epi64(index, addr, scale) \ + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), (__mmask8)-1, \ + (int)(scale))) + +#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm512_i64scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ + (__v8di)(__m512i)(index), \ + (__v8sf)(__m256)(v1), (int)(scale)) + +#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \ + (__v8di)(__m512i)(index), \ + (__v8sf)(__m256)(v1), (int)(scale)) + +#define _mm512_i64scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \ + (__v8di)(__m512i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + +#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \ + (__v8di)(__m512i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + +#define _mm512_i64scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \ + (__v8di)(__m512i)(index), \ + (__v8df)(__m512d)(v1), (int)(scale)) + +#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \ + (__v8di)(__m512i)(index), \ + (__v8df)(__m512d)(v1), (int)(scale)) + +#define _mm512_i64scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \ + (__v8di)(__m512i)(index), \ + (__v8di)(__m512i)(v1), (int)(scale)) + +#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \ + (__v8di)(__m512i)(index), \ + (__v8di)(__m512i)(v1), (int)(scale)) + +#define _mm512_i32scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \ + (__v16si)(__m512i)(index), \ + (__v16sf)(__m512)(v1), (int)(scale)) + +#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \ + (__v16si)(__m512i)(index), \ + (__v16sf)(__m512)(v1), (int)(scale)) + +#define _mm512_i32scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \ + (__v16si)(__m512i)(index), \ + (__v16si)(__m512i)(v1), (int)(scale)) + +#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \ + (__v16si)(__m512i)(index), \ + (__v16si)(__m512i)(v1), (int)(scale)) + +#define _mm512_i32scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), \ + (__v8df)(__m512d)(v1), (int)(scale)) + +#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), \ + 
(__v8df)(__m512d)(v1), (int)(scale)) + +#define _mm512_i32scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), \ + (__v8di)(__m512i)(v1), (int)(scale)) + +#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), \ + (__v8di)(__m512i)(v1), (int)(scale)) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + (__v4sf)__A, + (__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + (__v4sf)__B, + (__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, + (__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + (__v4sf)__A, + -(__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmsub_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + (__v4sf)__B, + -(__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, + (__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + (__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 
__DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + -(__v4sf)__A, + (__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmadd_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + -(__v4sf)__B, + (__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + (__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, + -(__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, + -(__v4sf)__A, + -(__v4sf)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmsub_round_ss(A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, + -(__v4sf)__B, + -(__v4sf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ + ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ + -(__v4sf)(__m128)(B), \ + -(__v4sf)(__m128)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, + -(__v4sf)__X, + (__v4sf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ + ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ + -(__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + (__v2df)__A, + (__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) + +#define 
_mm_mask_fmadd_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + (__v2df)__B, + (__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, + (__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + (__v2df)__A, + -(__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmsub_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + (__v2df)__B, + -(__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, + (__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + (__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + -(__v2df)__A, + (__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmadd_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + -(__v2df)__B, + (__v2df)__C, + 
(__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + (__v2df)(__m128d)(C), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, + -(__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, + -(__v2df)__A, + -(__v2df)__B, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmsub_round_sd(A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), (__mmask8)-1, \ + (int)(R))) + +#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, + -(__v2df)__B, + -(__v2df)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ + ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ + -(__v2df)(__m128d)(B), \ + -(__v2df)(__m128d)(C), \ + (__mmask8)(U), \ + (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) +{ + return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, + -(__v2df)__X, + (__v2df)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ + ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ + -(__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_permutex_pd(X, C) \ + ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) + +#define _mm512_mask_permutex_pd(W, U, X, C) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)(__m512d)(W))) + +#define _mm512_maskz_permutex_pd(U, X, C) \ + ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ + (__v8df)_mm512_permutex_pd((X), (C)), \ + (__v8df)_mm512_setzero_pd())) + +#define _mm512_permutex_epi64(X, C) \ + ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) + +#define _mm512_mask_permutex_epi64(W, U, X, C) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)(__m512i)(W))) + +#define _mm512_maskz_permutex_epi64(U, X, C) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_permutex_epi64((X), (C)), \ + (__v8di)_mm512_setzero_si512())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_pd (__m512i __X, __m512d __Y) +{ + return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) +{ + 
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutexvar_pd(__X, __Y), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_permutexvar_pd(__X, __Y), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_permutexvar_epi64(__X, __Y), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, + (__v8di)_mm512_permutexvar_epi64(__X, __Y), + (__v8di)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_ps (__m512i __X, __m512 __Y) +{ + return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutexvar_ps(__X, __Y), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_permutexvar_ps(__X, __Y), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); +} + +#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_permutexvar_epi32(__X, __Y), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, + (__v16si)_mm512_permutexvar_epi32(__X, __Y), + (__v16si)__W); +} + +#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kand (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kandn (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_kortestc (__mmask16 __A, __mmask16 __B) +{ + return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_mm512_kortestz (__mmask16 __A, __mmask16 __B) +{ + return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); +} + +static 
__inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_kortestchi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) +{ + return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { + *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); + return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kunpackb (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kxnor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_mm512_kxor (__mmask16 __A, __mmask16 __B) +{ + return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); +} + +#define _kand_mask16 _mm512_kand +#define _kandn_mask16 _mm512_kandn +#define _knot_mask16 _mm512_knot +#define _kor_mask16 _mm512_kor +#define _kxnor_mask16 _mm512_kxnor +#define _kxor_mask16 _mm512_kxor + +#define _kshiftli_mask16(A, I) \ + ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I))) + +#define _kshiftri_mask16(A, I) \ + ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_cvtmask16_u32(__mmask16 __A) { + return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_cvtu32_mask16(unsigned int __A) { + return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS +_load_mask16(__mmask16 *__A) { + return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_store_mask16(__mmask16 *__A, __mmask16 __B) { + *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_stream_si512 (void * __P, __m512i __A) +{ + typedef __v8di __v8di_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_stream_load_si512 (void const *__P) +{ + typedef __v8di __v8di_aligned __attribute__((aligned(64))); + return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_stream_pd (void *__P, __m512d __A) +{ + typedef __v8df __v8df_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_stream_ps (void *__P, __m512 __A) +{ + typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); + __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, + (__v8df) + _mm512_setzero_pd (), + 
(__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, + (__v8di) + _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, + (__v16sf) + _mm512_setzero_ps (), + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, + (__v16si) + _mm512_setzero_si512 (), + (__mmask16) __U); +} + +#define _mm_cmp_round_ss_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_ss_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_cmp_round_sd_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_sd_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (int)(P), \ + (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) + +/* Bit Test */ + +static __inline __mmask16 __DEFAULT_FN_ATTRS512 +_mm512_test_epi32_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline __mmask8 __DEFAULT_FN_ATTRS512 +_mm512_test_epi64_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask8 
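/*
 * Illustrative sketch, not part of the header diff above: _mm512_test_epi32_mask
 * sets a mask bit wherever (A & B) is non-zero, and _mm512_maskz_compress_epi32
 * left-packs the selected lanes while zeroing the rest.  The helper name
 * pack_nonzero_epi32 is hypothetical; assumes clang with -mavx512f and
 * #include <immintrin.h>.
 */
#include <immintrin.h>

static inline __m512i pack_nonzero_epi32(__m512i v)
{
    __mmask16 keep = _mm512_test_epi32_mask(v, v); /* 1 where v[i] != 0 */
    return _mm512_maskz_compress_epi32(keep, v);   /* survivors in the low lanes */
}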
__DEFAULT_FN_ATTRS512 +_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +_mm512_testn_epi32_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 +_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 +_mm512_testn_epi64_mask (__m512i __A, __m512i __B) +{ + return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 +_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) +{ + return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), + _mm512_setzero_si512()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_movehdup_ps (__m512 __A) +{ + return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, + 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_movehdup_ps(__A), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_movehdup_ps(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_moveldup_ps (__m512 __A) +{ + return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, + 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_moveldup_ps(__A), + (__v16sf)__W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) +{ + return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_moveldup_ps(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) +{ + return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), + _mm_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) +{ + return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), + _mm_setzero_pd()); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) +{ + 
__builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) +{ + __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, + (__v4sf)_mm_setzero_ps(), + 0, 4, 4, 4); + + return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_load_ss (__mmask8 __U, const float* __A) +{ + return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A, + (__v4sf) _mm_setzero_ps(), + __U & 1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) +{ + __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, + (__v2df)_mm_setzero_pd(), + 0, 2); + + return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sd (__mmask8 __U, const double* __A) +{ + return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, + (__v2df) _mm_setzero_pd(), + __U & 1); +} + +#define _mm512_shuffle_epi32(A, I) \ + ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) + +#define _mm512_mask_shuffle_epi32(W, U, A, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)(__m512i)(W))) + +#define _mm512_maskz_shuffle_epi32(U, A, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shuffle_epi32((A), (I)), \ + (__v16si)_mm512_setzero_si512())) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, + (__v8df) _mm512_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, + (__v8di) _mm512_setzero_si512 (), + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, + (__v8df) __W, + (__mmask8) __U); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) +{ + return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, + (__v8df) _mm512_setzero_pd(), + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, + (__v8di) __W, + (__mmask8) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, + (__v8di) _mm512_setzero_si512(), + (__mmask8) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 
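/*
 * Illustrative sketch, not part of the header diff above: the masked scalar
 * load/store pair only touches element 0, and only when bit 0 of the mask is
 * set.  The helper name copy_float_if is hypothetical; assumes clang with
 * -mavx512f and #include <immintrin.h>.
 */
#include <immintrin.h>

static inline void copy_float_if(float *dst, const float *src, int cond)
{
    __mmask8 m = cond ? (__mmask8)1 : (__mmask8)0;
    __m128 v = _mm_maskz_load_ss(m, src); /* *src in element 0, or 0.0f if masked off */
    _mm_mask_store_ss(dst, m, v);         /* *dst is written only when m & 1 */
}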
+_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) +{ + return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, + (__v16sf) _mm512_setzero_ps(), + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, + (__v16si) _mm512_setzero_si512(), + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) __W, + (__mmask16) __U); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, + (__v16sf) _mm512_setzero_ps(), + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) __W, + (__mmask16) __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) +{ + return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, + (__v16si) _mm512_setzero_si512(), + (__mmask16) __U); +} + +#define _mm512_cvt_roundps_pd(A, R) \ + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)(__m512d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundps_pd(U, A, R) \ + ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ + (__v8df)_mm512_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtps_pd (__m256 __A) +{ + return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtps_pd(__A), + (__v8df)__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) +{ + return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, + (__v8df)_mm512_cvtps_pd(__A), + (__v8df)_mm512_setzero_pd()); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_cvtpslo_pd (__m512 __A) +{ + return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) +{ + return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, + (__v8df) __A, + (__v8df) 
__W); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) +{ + return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, + (__v8df) __A, + (__v8df) _mm512_setzero_pd ()); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, + (__v16sf) __A, + (__v16sf) __W); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) +{ + return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, + (__v16sf) __A, + (__v16sf) _mm512_setzero_ps ()); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) +{ + __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) +{ + __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) +{ + __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, + (__mmask16) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 +_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) +{ + __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, + (__mmask16) __U); +} + +#define _mm_cvt_roundsd_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ + (__v2df)(__m128d)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) +{ + return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, + (__v2df)__B, + (__v4sf)__W, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) +{ + return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, + (__v2df)__B, + (__v4sf)_mm_setzero_ps(), + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvtss_i32 _mm_cvtss_si32 +#define _mm_cvtsd_i32 _mm_cvtsd_si32 +#define _mm_cvti32_sd _mm_cvtsi32_sd +#define _mm_cvti32_ss _mm_cvtsi32_ss +#ifdef __x86_64__ +#define _mm_cvtss_i64 _mm_cvtss_si64 +#define _mm_cvtsd_i64 _mm_cvtsd_si64 +#define _mm_cvti64_sd _mm_cvtsi64_sd +#define _mm_cvti64_ss _mm_cvtsi64_ss +#endif + +#ifdef __x86_64__ +#define _mm_cvt_roundi64_sd(A, B, R) \ + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) + +#define _mm_cvt_roundsi64_sd(A, B, R) \ + ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ + (int)(R))) +#endif + +#define _mm_cvt_roundsi32_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) + +#define _mm_cvt_roundi32_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) + +#ifdef __x86_64__ 
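/*
 * Illustrative sketch, not part of the header diff above: compress-store writes
 * only the selected lanes, contiguously and without alignment requirements,
 * which makes it a natural fit for stream filtering.  The helper name filter_gt
 * is hypothetical; assumes clang with -mavx512f, #include <immintrin.h>, and an
 * output buffer with room for up to 16 floats.
 */
#include <immintrin.h>

static inline int filter_gt(float *out, __m512 v, float cut)
{
    __mmask16 m = _mm512_cmp_ps_mask(v, _mm512_set1_ps(cut), _CMP_GT_OQ);
    _mm512_mask_compressstoreu_ps(out, m, v);   /* selected elements packed at out[0..] */
    return __builtin_popcount((unsigned)m);     /* number of elements written */
}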
+#define _mm_cvt_roundsi64_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) + +#define _mm_cvt_roundi64_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ + (int)(R))) +#endif + +#define _mm_cvt_roundss_sd(A, B, R) \ + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ + (__v4sf)(__m128)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) +{ + return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, + (__v4sf)__B, + (__v2df)__W, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) +{ + return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, + (__v4sf)__B, + (__v2df)_mm_setzero_pd(), + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_cvtu32_sd (__m128d __A, unsigned __B) +{ + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundu64_sd(A, B, R) \ + ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ + (unsigned long long)(B), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_cvtu64_sd (__m128d __A, unsigned long long __B) +{ + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvt_roundu32_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ + (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtu32_ss (__m128 __A, unsigned __B) +{ + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundu64_ss(A, B, R) \ + ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ + (unsigned long long)(B), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtu64_ss (__m128 __A, unsigned long long __B) +{ + __A[0] = __B; + return __A; +} +#endif + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) +{ + return (__m512i) __builtin_ia32_selectd_512(__M, + (__v16si) _mm512_set1_epi32(__A), + (__v16si) __O); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) +{ + return (__m512i) __builtin_ia32_selectq_512(__M, + (__v8di) _mm512_set1_epi64(__A), + (__v8di) __O); +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, + char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, + char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, + char __e46, char __e45, char __e44, char __e43, char __e42, char __e41, + char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, + char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, + char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, + char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, + char __e16, char __e15, char __e14, char __e13, char 
__e12, char __e11, + char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, + char __e4, char __e3, char __e2, char __e1, char __e0) { + + return __extension__ (__m512i)(__v64qi) + {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, + __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, + __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, + __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31, + __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39, + __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47, + __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55, + __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, + short __e27, short __e26, short __e25, short __e24, short __e23, + short __e22, short __e21, short __e20, short __e19, short __e18, + short __e17, short __e16, short __e15, short __e14, short __e13, + short __e12, short __e11, short __e10, short __e9, short __e8, + short __e7, short __e6, short __e5, short __e4, short __e3, + short __e2, short __e1, short __e0) { + return __extension__ (__m512i)(__v32hi) + {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, + __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, + __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, + __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; +} + +static __inline __m512i __DEFAULT_FN_ATTRS512 +_mm512_set_epi32 (int __A, int __B, int __C, int __D, + int __E, int __F, int __G, int __H, + int __I, int __J, int __K, int __L, + int __M, int __N, int __O, int __P) +{ + return __extension__ (__m512i)(__v16si) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ + e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ + (e5),(e4),(e3),(e2),(e1),(e0)) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_set_epi64 (long long __A, long long __B, long long __C, + long long __D, long long __E, long long __F, + long long __G, long long __H) +{ + return __extension__ (__m512i) (__v8di) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_set_pd (double __A, double __B, double __C, double __D, + double __E, double __F, double __G, double __H) +{ + return __extension__ (__m512d) + { __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ + _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_set_ps (float __A, float __B, float __C, float __D, + float __E, float __F, float __G, float __H, + float __I, float __J, float __K, float __L, + float __M, float __N, float __O, float __P) +{ + return __extension__ (__m512) + { __P, __O, __N, __M, __L, __K, __J, __I, + __H, __G, __F, __E, __D, __C, __B, __A }; +} + +#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ + _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ + (e4),(e3),(e2),(e1),(e0)) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_abs_ps(__m512 __A) +{ + return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, 
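/*
 * Illustrative sketch, not part of the header diff above: the _mm512_set_* forms
 * list elements from the highest index down, while the _mm512_setr_* macros list
 * them in memory order, so the two calls below build the same vector.  The helper
 * name iota_both_ways is hypothetical; assumes clang with -mavx512f and
 * #include <immintrin.h>.
 */
#include <immintrin.h>

static inline void iota_both_ways(__m512i *a, __m512i *b)
{
    *a = _mm512_set_epi32(15, 14, 13, 12, 11, 10, 9, 8,
                           7,  6,  5,  4,  3,  2, 1, 0);   /* element 0 = 0 */
    *b = _mm512_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7,
                           8, 9, 10, 11, 12, 13, 14, 15);  /* same contents */
}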
__m512 __A) +{ + return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_abs_pd(__m512d __A) +{ + return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) +{ + return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); +} + +/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as + * outputs. This class of vector operation forms the basis of many scientific + * computations. In vector-reduction arithmetic, the evaluation order is + * independent of the order of the input elements of V. + + * For floating-point intrinsics: + * 1. When using fadd/fmul intrinsics, the order of operations within the + * vector is unspecified (associative math). + * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector + * produce unspecified results. + + * Used bisection method. At each step, we partition the vector with previous + * step in half, and the operation is performed on its two halves. + * This takes log2(n) steps where n is the number of elements in the vector. + */ + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { +#if (__clang_major__ > 14) + return __builtin_reduce_add((__v8di)__W); +#else + return __builtin_ia32_reduce_add_q512(__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { +#if (__clang_major__ > 14) + return __builtin_reduce_mul((__v8di)__W); +#else + return __builtin_ia32_reduce_mul_q512(__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_and_q512(__W); +#else + return __builtin_reduce_and((__v8di)__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_or_q512(__W); +#else + return __builtin_reduce_or((__v8di)__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { + __W = _mm512_maskz_mov_epi64(__M, __W); +#if (__clang_major__ > 14) + return __builtin_reduce_add((__v8di)__W); +#else + return __builtin_ia32_reduce_add_q512(__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); +#if (__clang_major__ > 14) + return __builtin_reduce_mul((__v8di)__W); +#else + return __builtin_ia32_reduce_mul_q512(__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { + __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_and_q512(__W); +#else + return __builtin_reduce_and((__v8di)__W); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { + __W = _mm512_maskz_mov_epi64(__M, __W); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_or_q512(__W); +#else + return __builtin_reduce_or((__v8di)__W); +#endif +} + +// -0.0 is used to ignore the start value since it is the neutral value of +// floating point addition. 
For more information, please refer to +// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { + return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { + __W = _mm512_maskz_mov_pd(__M, __W); + return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { + __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); + return __builtin_ia32_reduce_fmul_pd512(1.0, __W); +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_epi32(__m512i __W) { +#if (__clang_major__ > 14) + return __builtin_reduce_add((__v16si)__W); +#else + return __builtin_ia32_reduce_add_d512((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_epi32(__m512i __W) { +#if (__clang_major__ > 14) + return __builtin_reduce_mul((__v16si)__W); +#else + return __builtin_ia32_reduce_mul_d512((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_and_epi32(__m512i __W) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_and_d512((__v16si)__W); +#else + return __builtin_reduce_and((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_or_epi32(__m512i __W) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_or_d512((__v16si)__W); +#else + return __builtin_reduce_or((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { + __W = _mm512_maskz_mov_epi32(__M, __W); +#if (__clang_major__ > 14) + return __builtin_reduce_add((__v16si)__W); +#else + return __builtin_ia32_reduce_add_d512((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); +#if (__clang_major__ > 14) + return __builtin_reduce_mul((__v16si)__W); +#else + return __builtin_ia32_reduce_mul_d512((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { + __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_and_d512((__v16si)__W); +#else + return __builtin_reduce_and((__v16si)__W); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { + __W = _mm512_maskz_mov_epi32(__M, __W); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_or_d512((__v16si)__W); +#else + return __builtin_reduce_or((__v16si)__W); +#endif +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ps(__m512 __W) { + return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ps(__m512 __W) { + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { + __W = _mm512_maskz_mov_ps(__M, __W); + return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 
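/*
 * Illustrative sketch, not part of the header diff above: the reduction helpers
 * fold a whole vector into one scalar, and the masked forms first neutralize the
 * ignored lanes (0.0 for add, 1.0 for mul) before reducing.  The helper name
 * sum_even_lanes is hypothetical; assumes clang with -mavx512f and
 * #include <immintrin.h>.
 */
#include <immintrin.h>

static inline float sum_even_lanes(__m512 v)
{
    /* 0x5555 selects lanes 0, 2, 4, ... 14; the odd lanes contribute 0.0. */
    return _mm512_mask_reduce_add_ps((__mmask16)0x5555, v);
}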
__W) { + __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); + return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epi64(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smax_q512(__V); +#else + return __builtin_reduce_max((__v8di)__V); +#endif +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epu64(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umax_q512(__V); +#else + return __builtin_reduce_max((__v8du)__V); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epi64(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smin_q512(__V); +#else + return __builtin_reduce_min((__v8di)__V); +#endif +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epu64(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umin_q512(__V); +#else + return __builtin_reduce_min((__v8du)__V); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smax_q512(__V); +#else + return __builtin_reduce_max((__v8di)__V); +#endif +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { + __V = _mm512_maskz_mov_epi64(__M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umax_q512(__V); +#else + return __builtin_reduce_max((__v8du)__V); +#endif +} + +static __inline__ long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smin_q512(__V); +#else + return __builtin_reduce_min((__v8di)__V); +#endif +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { + __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umin_q512(__V); +#else + return __builtin_reduce_min((__v8du)__V); +#endif +} +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epi32(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smax_d512((__v16si)__V); +#else + return __builtin_reduce_max((__v16si)__V); +#endif +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_epu32(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umax_d512((__v16si)__V); +#else + return __builtin_reduce_max((__v16su)__V); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epi32(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smin_d512((__v16si)__V); +#else + return __builtin_reduce_min((__v16si)__V); +#endif +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_epu32(__m512i __V) { +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umin_d512((__v16si)__V); +#else + return __builtin_reduce_min((__v16su)__V); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); +#if (__clang_major__ < 14) + return 
__builtin_ia32_reduce_smax_d512((__v16si)__V); +#else + return __builtin_reduce_max((__v16si)__V); +#endif +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { + __V = _mm512_maskz_mov_epi32(__M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umax_d512((__v16si)__V); +#else + return __builtin_reduce_max((__v16su)__V); +#endif +} + +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_smin_d512((__v16si)__V); +#else + return __builtin_reduce_min((__v16si)__V); +#endif +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { + __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); +#if (__clang_major__ < 14) + return __builtin_ia32_reduce_umin_d512((__v16si)__V); +#else + return __builtin_reduce_min((__v16su)__V); +#endif +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_pd(__m512d __V) { + return __builtin_ia32_reduce_fmax_pd512(__V); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_pd(__m512d __V) { + return __builtin_ia32_reduce_fmin_pd512(__V); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { + __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); + return __builtin_ia32_reduce_fmax_pd512(__V); +} + +static __inline__ double __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { + __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); + return __builtin_ia32_reduce_fmin_pd512(__V); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ps(__m512 __V) { + return __builtin_ia32_reduce_fmax_ps512(__V); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ps(__m512 __V) { + return __builtin_ia32_reduce_fmin_ps512(__V); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { + __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); + return __builtin_ia32_reduce_fmax_ps512(__V); +} + +static __inline__ float __DEFAULT_FN_ATTRS512 +_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { + __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); + return __builtin_ia32_reduce_fmin_ps512(__V); +} + +/// Moves the least significant 32 bits of a vector of [16 x i32] to a +/// 32-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __A +/// A vector of [16 x i32]. The least significant 32 bits are moved to the +/// destination. +/// \returns A 32-bit signed integer containing the moved value. +static __inline__ int __DEFAULT_FN_ATTRS512 +_mm512_cvtsi512_si32(__m512i __A) { + __v16si __b = (__v16si)__A; + return __b[0]; +} + +/// Loads 8 double-precision (64-bit) floating-point elements stored at memory +/// locations starting at location \a base_addr at packed 32-bit integer indices +/// stored in the lower half of \a vindex scaled by \a scale them in dst. +/// +/// This intrinsic corresponds to the VGATHERDPD instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// dst[i+63:i] := MEM[addr+63:addr] +/// ENDFOR +/// dst[MAX:512] := 0 +/// \endoperation +#define _mm512_i32logather_pd(vindex, base_addr, scale) \ + _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale)) + +/// Loads 8 double-precision (64-bit) floating-point elements from memory +/// starting at location \a base_addr at packed 32-bit integer indices stored in +/// the lower half of \a vindex scaled by \a scale into dst using writemask +/// \a mask (elements are copied from \a src when the corresponding mask bit is +/// not set). +/// +/// This intrinsic corresponds to the VGATHERDPD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// IF mask[j] +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// dst[i+63:i] := MEM[addr+63:addr] +/// ELSE +/// dst[i+63:i] := src[i+63:i] +/// FI +/// ENDFOR +/// dst[MAX:512] := 0 +/// \endoperation +#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \ + _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \ + (base_addr), (scale)) + +/// Loads 8 64-bit integer elements from memory starting at location \a base_addr +/// at packed 32-bit integer indices stored in the lower half of \a vindex +/// scaled by \a scale and stores them in dst. +/// +/// This intrinsic corresponds to the VPGATHERDQ instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// dst[i+63:i] := MEM[addr+63:addr] +/// ENDFOR +/// dst[MAX:512] := 0 +/// \endoperation +#define _mm512_i32logather_epi64(vindex, base_addr, scale) \ + _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale)) + +/// Loads 8 64-bit integer elements from memory starting at location \a base_addr +/// at packed 32-bit integer indices stored in the lower half of \a vindex +/// scaled by \a scale and stores them in dst using writemask \a mask (elements +/// are copied from \a src when the corresponding mask bit is not set). +/// +/// This intrinsic corresponds to the VPGATHERDQ instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// IF mask[j] +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// dst[i+63:i] := MEM[addr+63:addr] +/// ELSE +/// dst[i+63:i] := src[i+63:i] +/// FI +/// ENDFOR +/// dst[MAX:512] := 0 +/// \endoperation +#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \ + _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \ + (base_addr), (scale)) + +/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 +/// and to memory locations starting at location \a base_addr at packed 32-bit +/// integer indices stored in \a vindex scaled by \a scale. +/// +/// This intrinsic corresponds to the VSCATTERDPD instructions. 
+/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// MEM[addr+63:addr] := v1[i+63:i] +/// ENDFOR +/// \endoperation +#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \ + _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale)) + +/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 +/// to memory locations starting at location \a base_addr at packed 32-bit +/// integer indices stored in \a vindex scaled by \a scale. Only those elements +/// whose corresponding mask bit is set in writemask \a mask are written to +/// memory. +/// +/// This intrinsic corresponds to the VSCATTERDPD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// IF mask[j] +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// MEM[addr+63:addr] := a[i+63:i] +/// FI +/// ENDFOR +/// \endoperation +#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \ + _mm512_mask_i32scatter_pd((base_addr), (mask), \ + _mm512_castsi512_si256(vindex), (v1), (scale)) + +/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in +/// memory locations starting at location \a base_addr at packed 32-bit integer +/// indices stored in \a vindex scaled by \a scale. +/// +/// This intrinsic corresponds to the VPSCATTERDQ instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// MEM[addr+63:addr] := a[i+63:i] +/// ENDFOR +/// \endoperation +#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \ + _mm512_i32scatter_epi64((base_addr), \ + _mm512_castsi512_si256(vindex), (v1), (scale)) + +/// Stores 8 packed 64-bit integer elements located in a and stores them in +/// memory locations starting at location \a base_addr at packed 32-bit integer +/// indices stored in \a vindex scaled by scale using writemask \a mask (elements +/// whose corresponding mask bit is not set are not written to memory). +/// +/// This intrinsic corresponds to the VPSCATTERDQ instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// i := j*64 +/// m := j*32 +/// IF mask[j] +/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 +/// MEM[addr+63:addr] := a[i+63:i] +/// FI +/// ENDFOR +/// \endoperation +#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \ + _mm512_mask_i32scatter_epi64((base_addr), (mask), \ + _mm512_castsi512_si256(vindex), (v1), (scale)) + +#undef __DEFAULT_FN_ATTRS512 +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS + +#endif /* __AVX512FINTRIN_H */ diff --git a/include-llvm/avx512fp16intrin.h b/include-llvm/avx512fp16intrin.h new file mode 100644 index 0000000..99409a3 --- /dev/null +++ b/include-llvm/avx512fp16intrin.h @@ -0,0 +1,3349 @@ +/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
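/*
 * Illustrative sketch, not part of the header diff above: the
 * _mm512_i32logather_* / _mm512_i32loscatter_* macros defined at the end of
 * avx512fintrin.h address eight 64-bit elements through the low eight 32-bit
 * indices of a 512-bit index vector.  The helper name double_indexed is
 * hypothetical; assumes clang with -mavx512f, #include <immintrin.h>, and
 * indices that stay within the bounds of tbl.
 */
#include <immintrin.h>

static inline void double_indexed(double *tbl, __m512i idx)
{
    __m512d v = _mm512_i32logather_pd(idx, tbl, 8);  /* scale 8 = sizeof(double) */
    v = _mm512_add_pd(v, v);
    _mm512_i32loscatter_pd(tbl, idx, v, 8);          /* write back to the same slots */
}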
+#endif + +#ifndef __AVX512FP16INTRIN_H +#define __AVX512FP16INTRIN_H + +/* Define the default attributes for the functions in this file. */ +typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); +typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); +typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS512 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(512))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { + return __a[0]; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { + return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { + return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { + return (__m256h)__builtin_ia32_undef256(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { + return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { + return (__m128h)__builtin_ia32_undef128(); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { + return (__m512h)__builtin_ia32_undef512(); +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { + return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, + _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, + _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, + _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, + _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { + return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26, + __h25, __h24, __h23, __h22, __h21, __h20, __h19, + __h18, __h17, __h16, __h15, __h14, __h13, __h12, + __h11, __h10, __h9, __h8, __h7, __h6, __h5, + __h4, __h3, __h2, __h1}; +} + +#define 
_mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ + h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \ + h25, h26, h27, h28, h29, h30, h31, h32) \ + _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \ + (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \ + (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \ + (h5), (h4), (h3), (h2), (h1)) + +static __inline __m512h __DEFAULT_FN_ATTRS512 +_mm512_set1_pch(_Float16 _Complex h) { + return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h)); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { + return (__m128)__a; +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { + return (__m256)__a; +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { + return (__m512)__a; +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { + return (__m128d)__a; +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { + return (__m256d)__a; +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { + return (__m512d)__a; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { + return (__m128i)__a; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_castph_si256(__m256h __a) { + return (__m256i)__a; +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_castph_si512(__m512h __a) { + return (__m512i)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { + return (__m128h)__a; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castsi256_ph(__m256i __a) { + return (__m256h)__a; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castsi512_ph(__m512i __a) { + return (__m512h)__a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_castph256_ph128(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph128(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_castph512_ph256(__m512h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_castph128_ph256(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_castph128_ph512(__m128h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 
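/*
 * Illustrative sketch, not part of the header diff above: the castph/castsi
 * helpers reinterpret bits without converting values, so integer bit tricks can
 * be applied to _Float16 vectors.  The helper name negate_ph is hypothetical;
 * assumes clang with -mavx512fp16 and #include <immintrin.h>.
 */
#include <immintrin.h>

static inline __m512h negate_ph(__m512h v)
{
    __m512i bits = _mm512_castph_si512(v);
    bits = _mm512_xor_si512(bits, _mm512_set1_epi16((short)0x8000)); /* flip each sign bit */
    return _mm512_castsi512_ph(bits);
}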
+_mm512_castph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1); +} + +/// Constructs a 256-bit floating-point vector of [16 x half] from a +/// 128-bit floating-point vector of [8 x half]. The lower 128 bits +/// contain the value of the source vector. The upper 384 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x half]. +/// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_zextph128_ph256(__m128h __a) { + return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); +} + +/// Constructs a 512-bit floating-point vector of [32 x half] from a +/// 128-bit floating-point vector of [8 x half]. The lower 128 bits +/// contain the value of the source vector. The upper 384 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x half]. +/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits +/// contain the value of the parameter. The upper 384 bits are set to zero. +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_zextph128_ph512(__m128h __a) { + return __builtin_shufflevector( + __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, + 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); +} + +/// Constructs a 512-bit floating-point vector of [32 x half] from a +/// 256-bit floating-point vector of [16 x half]. The lower 256 bits +/// contain the value of the source vector. The upper 256 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit vector of [16 x half]. +/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits +/// contain the value of the parameter. The upper 256 bits are set to zero. 
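/*
 * Illustrative sketch, not part of the header diff above: _mm512_castph128_ph512
 * leaves the upper 384 bits undefined, whereas _mm512_zextph128_ph512 zeroes
 * them, so only the zext form is safe when the whole 512-bit result is consumed
 * later.  The helper name widen_zeroed is hypothetical; assumes clang with
 * -mavx512fp16 and #include <immintrin.h>.
 */
#include <immintrin.h>

static inline __m512h widen_zeroed(__m128h lo)
{
    return _mm512_zextph128_ph512(lo); /* halves 8..31 are 0.0 */
}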
+static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_zextph256_ph512(__m256h __a) { + return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, + 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, + 29, 30, 31); +} + +#define _mm_comi_round_sh(A, B, P, R) \ + __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) + +#define _mm_comi_sh(A, B, pred) \ + _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, + __m128h B) { + return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A + (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_add_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_add_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_add_round_ph(W, U, A, B, R) 
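
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * A minimal illustration of the masked packed add and the scalar ordered
 * compare defined above. Assumes a clang build with AVX512FP16 enabled
 * (e.g. -march=sapphirerapids); all names below are illustrative only. */
#include <immintrin.h>

static __m512h masked_add_demo(__m512h a, __m512h b) {
  /* Lanes 0..15 receive a + b; lanes 16..31 keep the value from a. */
  return _mm512_mask_add_ph(a, (__mmask32)0x0000FFFF, a, b);
}

static int first_lane_less(__m128h x, __m128h y) {
  /* Ordered compare of element 0 of each vector. */
  return _mm_comilt_sh(x, y);
}
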
\ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_add_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A - (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_sub_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_sub_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_sub_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sub_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A * (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_mul_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_mul_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_mul_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_mul_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, + __m512h __B) { + return (__m512h)((__v32hf)__A / (__v32hf)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_div_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_div_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_div_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), 
(__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_div_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_min_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_min_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_min_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_min_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, + (__v32hf)_mm512_max_ph(__A, __B), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm512_max_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(R))) + +#define _mm512_mask_max_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_max_round_ph(U, A, B, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { + return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { + return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f)); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectps_512( + (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, + (__v16sf)_mm512_conj_pch(__A), + (__v16sf)_mm512_setzero_ps()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, + __m128h __B) 
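
/* ---- Editorial note, not part of the upstream header. ----
 * The abs/conj intrinsics above are plain bit manipulations: _mm512_abs_ph
 * ANDs each 32-bit lane with 0x7FFF7FFF, clearing the sign bit of both packed
 * halves, while _mm512_conj_pch XORs with -0.0f (0x80000000), flipping only
 * the sign of the upper half of each 32-bit pair, i.e. the imaginary part of
 * each complex half-precision value. Hedged sketch, assuming a clang build
 * with AVX512FP16 enabled. */
#include <immintrin.h>

static void abs_conj_demo(void) {
  __m512h x = _mm512_set1_ph((_Float16)-2.0);
  __m512h ax = _mm512_abs_ph(x);   /* every lane becomes +2.0 */
  __m512h cx = _mm512_conj_pch(x); /* imaginary (odd) lanes flip to +2.0 */
  (void)ax;
  (void)cx;
}
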
{ + __A[0] += __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_add_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_add_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_add_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_add_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_addsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, + __m128h __B) { + __A[0] -= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_sub_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_sub_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sub_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_subsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, + __m128h __B) { + __A[0] *= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_mul_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_mul_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_mul_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_mul_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_mulsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, + __m128h __B) { + __A[0] /= __B[0]; + return __A; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, + __mmask8 __U, + __m128h __A, 
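
/* ---- Editorial note, not part of the upstream header. ----
 * The _mm_*_sh arithmetic above operates on element 0 only; elements 1..7 of
 * the first operand pass through unchanged, matching the scalar VADDSH/VSUBSH/
 * VMULSH/VDIVSH behaviour. Hedged sketch, assuming clang with AVX512FP16. */
#include <immintrin.h>

static __m128h scalar_arith_demo(__m128h a, __m128h b) {
  __m128h s = _mm_add_sh(a, b); /* s[0] = a[0] + b[0], s[1..7] = a[1..7] */
  return _mm_mul_sh(s, b);      /* element 0 multiplied, rest kept from s */
}
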
+ __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + __A = _mm_div_sh(__A, __B); + return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); +} + +#define _mm_div_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_div_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_div_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_divsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_min_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_min_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_min_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_minsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_max_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_max_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_max_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_maxsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_cmp_round_ph_mask(A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ + ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)(__m512h)(B), (int)(P), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_cmp_ph_mask(A, B, P) \ + _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm512_mask_cmp_ph_mask(U, A, B, P) \ + _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) + +#define _mm_cmp_round_sh_mask(X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ + ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ + (__v8hf)(__m128h)(Y), (int)(P), \ + (__mmask8)(M), (int)(R))) + +#define _mm_cmp_sh_mask(X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_cmp_sh_mask(M, X, Y, P) \ + ((__mmask8)__builtin_ia32_cmpsh_mask( \ + (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ + _MM_FROUND_CUR_DIRECTION)) +// loads with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { + struct __mm_load_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u; + return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { + __m128h src = (__v8hf)__builtin_shufflevector( + (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); + + return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_load_sh(__mmask8 __U, const void *__A) { + return (__m128h)__builtin_ia32_loadsh128_mask( + (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_load_ph(void const *__p) { + return *(const __m512h *)__p; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_load_ph(void const *__p) { + return *(const __m256h *)__p; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { + return *(const __m128h *)__p; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_loadu_ph(void const *__p) { + struct __loadu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_loadu_ph(void const *__p) { + struct __loadu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { + struct __loadu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ph *)__p)->__v; +} + +// stores with vmovsh: +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, + __m128h __a) { + struct __mm_store_sh_struct { + _Float16 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; +} + +static __inline__ void 
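
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * _mm512_loadu_ph tolerates arbitrary alignment (it loads through a packed,
 * may_alias struct), whereas _mm512_load_ph expects a 64-byte aligned
 * address; _mm512_cmp_ph_mask turns a per-lane predicate into a __mmask32.
 * Assumes clang with AVX512FP16; the buffer and threshold are illustrative. */
#include <immintrin.h>

static __mmask32 lanes_below_one(const _Float16 *data /* >= 32 values */) {
  __m512h v = _mm512_loadu_ph(data);             /* unaligned load of 32 halves */
  __m512h one = _mm512_set1_ph((_Float16)1.0);
  return _mm512_cmp_ph_mask(v, one, _CMP_LT_OQ); /* bit i set iff data[i] < 1.0 */
}
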
__DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, + __mmask8 __U, + __m128h __A) { + __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, + __m512h __A) { + *(__m512h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, + __m256h __A) { + *(__m256h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, + __m128h __A) { + *(__m128h *)__P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, + __m512h __A) { + struct __storeu_ph { + __m512h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, + __m256h __A) { + struct __storeu_ph { + __m256h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, + __m128h __A) { + struct __storeu_ph { + __m128h_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ph *)__P)->__v = __A; +} + +// moves with vmovsh: +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, + __m128h __b) { + __a[0] = __b[0]; + return __a; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), + _mm_setzero_ph()); +} + +// vmovw: +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { + return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { + __v8hi __b = (__v8hi)__a; + return __b[0]; +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rcpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, + (__mmask32)__U); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_rsqrtph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); +} + +#define _mm512_getmant_ph(A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_getmant_ph(W, U, A, B, C) \ + 
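
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * _mm512_rcp_ph and _mm512_rsqrt_ph return hardware approximations rather
 * than correctly rounded results; see the instruction reference for the exact
 * error bounds. Assumes clang with AVX512FP16. */
#include <immintrin.h>

static __m512h approx_sqrt(__m512h x) {
  __m512h r = _mm512_rsqrt_ph(x); /* approximate 1/sqrt(x) per lane */
  return _mm512_mul_ph(x, r);     /* x * (1/sqrt(x)) ~ sqrt(x), cheaply */
}
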
((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_getmant_ph(U, A, B, C) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_getmant_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_getmantph512_mask( \ + (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_getexpph512_mask( + (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_getexp_round_ph(A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_getexp_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_getexp_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__W, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_scalefph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_scalef_round_ph(A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R))) + +#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_scalef_round_ph(U, A, B, R) \ + 
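
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * getexp returns floor(log2|x|) per lane, getmant returns the mantissa
 * normalized into a chosen interval, and scalef computes a * 2^floor(b), so
 * for finite, normal inputs scalef(getmant(x), getexp(x)) reconstructs x.
 * Assumes clang with AVX512FP16; the enum constants come from the AVX-512F
 * headers. */
#include <immintrin.h>

static __m512h decompose_and_rebuild(__m512h x) {
  __m512h mant = _mm512_getmant_ph(x, _MM_MANT_NORM_1_2, _MM_MANT_SIGN_src);
  __m512h exp = _mm512_getexp_ph(x);  /* floor(log2(|x|)) as half floats */
  return _mm512_scalef_ph(mant, exp); /* mant * 2^exp == x for normal values */
}
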
((__m512h)__builtin_ia32_scalefph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \ + (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +#define _mm512_roundscale_ph(A, B) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \ + _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_ph(A, B, C, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \ + (__mmask32)(B), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_roundscale_ph(A, B, imm) \ + ((__m512h)__builtin_ia32_rndscaleph_mask( \ + (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \ + (__v32hf)(__m512h)(A), \ + (__mmask32)(B), (int)(R))) + +#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(A), (int)(R))) + +#define _mm512_roundscale_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_reduce_ph(A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_ph(W, U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_maskz_reduce_ph(U, A, imm) \ + ((__m512h)__builtin_ia32_reduceph512_mask( \ + (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)(__m512h)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_setzero_ph(), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_reduce_round_ph(A, imm, R) \ + ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rcpsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W, + __mmask8 __U, + __m128h __A, 
+ __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_rsqrtsh_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +#define _mm_getmant_round_sh(A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_getmant_sh(A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_sh(W, U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_getmant_sh(U, A, B, C, D) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ + ((__m128h)__builtin_ia32_getmantsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ + (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +#define _mm_getexp_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_getexp_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_getexpsh128_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_getexp_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_scalef_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)(__B), 
(__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask_scalef_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefsh_round_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_scalef_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_scalefsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +#define _mm_roundscale_round_sh(A, B, imm, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), (int)(R))) + +#define _mm_roundscale_sh(A, B, imm) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_sh(W, U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_maskz_roundscale_sh(U, A, B, I) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ + ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(I), (int)(R))) + +#define _mm_reduce_sh(A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_mask_reduce_sh(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_maskz_reduce_sh(U, A, B, C) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) + +#define _mm_reduce_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(C), (int)(R))) + +#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_reducesh_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
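
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * For the scalar roundscale/reduce forms above, the immediate's upper nibble
 * selects how many fraction bits to keep and the lower nibble selects the
 * rounding control; reduce returns the part that was rounded away. This
 * reading of the VRNDSCALESH/VREDUCESH encoding is hedged, not authoritative.
 * Assumes clang with AVX512FP16. */
#include <immintrin.h>

static void split_int_frac(__m128h x, __m128h *ipart, __m128h *fpart) {
  /* imm 0x01: keep 0 fraction bits (round to an integer), round toward -inf */
  *ipart = _mm_roundscale_sh(x, x, 0x01); /* element 0 becomes floor(x[0]) */
  *fpart = _mm_reduce_sh(x, x, 0x01);     /* element 0 becomes x[0] - floor(x[0]) */
}
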
(__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(C), (int)(R))) + +#define _mm512_sqrt_round_ph(A, R) \ + ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) + +#define _mm512_mask_sqrt_round_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)(__m512h)(W))) + +#define _mm512_maskz_sqrt_round_ph(U, A, R) \ + ((__m512h)__builtin_ia32_selectph_512( \ + (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ + (__v32hf)_mm512_setzero_ph())) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { + return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)(__m512h)(__W)); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { + return (__m512h)__builtin_ia32_selectph_512( + (__mmask32)(__U), + (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), + (__v32hf)_mm512_setzero_ph()); +} + +#define _mm_sqrt_round_sh(A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_sqrt_round_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, + __mmask32 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_sqrtsh_round_mask( + (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), + (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)(U))) + +#define _mm512_fpclass_ph_mask(A, imm) \ + ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ + (int)(imm), (__mmask32)-1)) + +#define _mm_fpclass_sh_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_sh_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm512_cvt_roundpd_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ + 
((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ + (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( + (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_pd(A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_pd(U, A, R) \ + ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ + (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512d __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m512d)__builtin_ia32_vcvtph2pd512_mask( + (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_ss(A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ + (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ + ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, + __mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, + __m128 __A, + __m128h __B) { + return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( + (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundss_sh(A, B, R) \ + 
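
/* ---- Editorial usage sketch, not part of the upstream header. ----
 * The pd<->ph conversions above narrow 8 doubles to 8 half floats and widen
 * them back; the round trip keeps the values but only at fp16 precision.
 * Assumes clang with AVX512FP16. */
#include <immintrin.h>

static __m512d roundtrip_through_fp16(__m512d x) {
  __m128h h = _mm512_cvtpd_ph(x); /* 8 doubles -> 8 halves (precision is lost) */
  return _mm512_cvtph_pd(h);      /* widen back: values are now fp16-rounded */
}
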
((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ + (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, + __m128h __A, + __m128 __B) { + return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( + (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsd_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_undefined_ph(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ + (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { + return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( + (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_sd(A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ + (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) + +#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ + ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, + __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, + __mmask8 __U, + __m128d __A, + __m128h __B) { + return 
(__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { + return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( + (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_undefined_epi32(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ + (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ + (__v32hi)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2w512_mask( + (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ + (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + 
(__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtw2ph512_mask( + (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvtph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu16(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ + (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ + (__v32hu)_mm512_setzero_epi32(), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu16(__m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { + return (__m512i)__builtin_ia32_vcvttph2uw512_mask( + (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu16_ph(A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ + (__v32hf)_mm512_undefined_ph(), \ + (__mmask32)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ + ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ + (__v32hu)(A), 
(__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu16_ph(__m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { + return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( + (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu32(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ + (__v16su)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvtph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ + 
((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ + (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( + (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu32_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ + (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu32_ph(__m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { + return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( + (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi32(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ + (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ + (__v16si)_mm512_setzero_epi32(), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2dq512_mask( + (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu32(A, R) 
\ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ + (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ + (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu32(__m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { + return (__m512i)__builtin_ia32_vcvttph2udq512_mask( + (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepi64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ + (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepi64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( + (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ + (__v8di)_mm512_undefined_epi32(), \ + (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return 
(__m512i)__builtin_ia32_vcvtph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundepu64_ph(A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ + (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) + +#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ + ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ + (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_cvtepu64_ph(__m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( + (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvtph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epi64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ + (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epi64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)__W, (__mmask8)__U, 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2qq512_mask( + (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtt_roundph_epu64(A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ + (int)(R))) + +#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ + ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ + (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_cvttph_epu64(__m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( + (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_i32(A, R) \ + ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_cvt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvtsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvtsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif // __x86_64__ + +#define _mm_cvt_roundu32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu32_sh(__m128h __A, unsigned int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundu64_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ + (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_cvtu64_sh(__m128h __A, unsigned long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvt_roundi32_sh(A, B, R) \ + ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, + int __B) { + __A[0] = __B; + return __A; +} + +#ifdef __x86_64__ +#define _mm_cvt_roundi64_sh(A, B, 
R) \ + ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, + long long __B) { + __A[0] = __B; + return __A; +} +#endif + +#define _mm_cvtt_roundsh_i32(A, R) \ + ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) + +static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { + return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_i64(A, R) \ + ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) + +static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { + return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm_cvtt_roundsh_u32(A, R) \ + ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u32(__m128h __A) { + return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, + _MM_FROUND_CUR_DIRECTION); +} + +#ifdef __x86_64__ +#define _mm_cvtt_roundsh_u64(A, R) \ + ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 +_mm_cvttsh_u64(__m128h __A) { + return (unsigned long long)__builtin_ia32_vcvttsh2usi64( + (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); +} +#endif + +#define _mm512_cvtx_roundph_ps(A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ + (__v16sf)_mm512_undefined_ps(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ + ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ + (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512 __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { + return (__m512)__builtin_ia32_vcvtph2psx512_mask( + (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_cvtx_roundps_ph(A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ + (__v16hf)_mm256_undefined_ph(), \ + (__mmask16)(-1), (int)(R))) + +#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ + ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ + (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + 
(__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS512 +_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { + return (__m256h)__builtin_ia32_vcvtps2phx512_mask( + (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fnmadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fnmsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + 
_MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, + -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_maskz( + -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmaddsub_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_fmsubadd_round_ph(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)-1, (int)(R))) + +#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ + 
((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask3( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_mask( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( + (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \ + (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubaddph512_mask3( + (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 
__U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddph512_mask( \ + (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ + -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ + (__mmask32)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, + -(__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { + return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B, + (__v32hf)__C, (__mmask32)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, + -(__v8hf)__B, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, + -(__v8hf)__B, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmsub_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), 
(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)R)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ + (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmadd_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W, + __m128h __A, + __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, + (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, + (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fnmsub_round_sh(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + 
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ + (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { + return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, + (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \ + ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ + (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask( + (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fcmadd_round_sch(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ + ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \ + ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcsh_round_mask( + (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); +} + 
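/*
 * Editor's note: the sketch below is illustrative only and is not part of
 * this patch. Assuming a clang toolchain with AVX512-FP16 enabled (e.g.
 * -mavx512fp16) and <immintrin.h> providing these declarations, it shows how
 * the complex scalar FMA intrinsics defined above might be called; the helper
 * name fmadd_sch_demo is hypothetical.
 */
#include <immintrin.h>

static inline __m128h fmadd_sch_demo(__m128h a, __m128h b, __m128h c,
                                     __mmask8 k)
{
    /* Unmasked form: multiply the low complex FP16 pair of a and b and
     * accumulate the low complex pair of c. */
    __m128h r = _mm_fmadd_sch(a, b, c);

    /* Masked form: when bit 0 of k is clear, the low complex element of the
     * first operand (here r) is passed through unchanged. */
    return _mm_mask_fmadd_sch(r, k, b, c);
}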
+static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmadd_round_sch(A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ + ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \ + ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ + (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fcmul_round_sch(A, B, R) \ + ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fcmul_round_sch(U, A, B, R) \ + ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcsh_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm_fmul_round_sch(A, B, R) \ + ((__m128h)__builtin_ia32_vfmulcsh_mask( \ + 
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) + +#define _mm_mask_fmul_round_sch(W, U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ + (__mmask8)(U), (int)(R))) + +#define _mm_maskz_fmul_round_sch(U, A, B, R) \ + ((__m128h)__builtin_ia32_vfmulcsh_mask( \ + (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ + (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_vfcmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfcmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fcmul_round_pch(A, B, R) \ + ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) + +#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \ + ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A, + __m512h __B) { + return (__m512h)__builtin_ia32_vfmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__W, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { + return (__m512h)__builtin_ia32_vfmulcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmul_round_pch(A, B, R) \ + ((__m512h)__builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) + +#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \ + ((__m512h)__builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_fmul_round_pch(U, A, B, R) \ + ((__m512h)__builtin_ia32_vfmulcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ + (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, + __m512h __B, + __m512h __C) { + return 
(__m512h)__builtin_ia32_vfcmaddcph512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfcmaddcph512_mask( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fcmadd_round_pch(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ + ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, + __m512h __B, + __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)-1, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, + (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { + return (__m512h)__builtin_ia32_vfmaddcph512_mask3( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { + return (__m512h)__builtin_ia32_vfmaddcph512_maskz( + (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, + _MM_FROUND_CUR_DIRECTION); +} + +#define _mm512_fmadd_round_pch(A, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)-1, (int)(R))) + +#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ + ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ + ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ + 
((__m512h)__builtin_ia32_vfmaddcph512_maskz( \ + (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ + (__mmask16)(U), (int)(R))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_add_ph(__m512h __W) { + return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_mul_ph(__m512h __W) { + return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_max_ph(__m512h __V) { + return __builtin_ia32_reduce_fmax_ph512(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS512 +_mm512_reduce_min_ph(__m512h __V) { + return __builtin_ia32_reduce_fmin_ph512(__V); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { + return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, + (__v32hf)__A); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { + return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, + (__v32hi)__B); +} + +static __inline__ __m512h __DEFAULT_FN_ATTRS512 +_mm512_permutexvar_ph(__m512i __A, __m512h __B) { + return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); +} + +// intrinsics below are alias for f*mul_*ch +#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B) +#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B) +#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B) +#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R) +#define _mm512_mask_mul_round_pch(W, U, A, B, R) \ + _mm512_mask_fmul_round_pch(W, U, A, B, R) +#define _mm512_maskz_mul_round_pch(U, A, B, R) \ + _mm512_maskz_fmul_round_pch(U, A, B, R) + +#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B) +#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B) +#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B) +#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R) +#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \ + _mm512_mask_fcmul_round_pch(W, U, A, B, R) +#define _mm512_maskz_cmul_round_pch(U, A, B, R) \ + _mm512_maskz_fcmul_round_pch(U, A, B, R) + +#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B) +#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B) +#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B) +#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R) +#define _mm_mask_mul_round_sch(W, U, A, B, R) \ + _mm_mask_fmul_round_sch(W, U, A, B, R) +#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R) + +#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B) +#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B) +#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B) +#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R) +#define _mm_mask_cmul_round_sch(W, U, A, B, R) \ + _mm_mask_fcmul_round_sch(W, U, A, B, R) +#define _mm_maskz_cmul_round_sch(U, A, B, R) \ + _mm_maskz_fcmul_round_sch(U, A, B, R) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 +#undef __DEFAULT_FN_ATTRS512 + +#endif diff --git a/include-llvm/avx512ifmaintrin.h b/include-llvm/avx512ifmaintrin.h new file mode 100644 index 0000000..5f7da52 --- /dev/null +++ b/include-llvm/avx512ifmaintrin.h @@ -0,0 +1,68 @@ +/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------=== + * + * + * Part of the LLVM 
Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __IFMAINTRIN_H +#define __IFMAINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y, + (__v8di) __Z); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y, + (__v8di) __Z); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), + (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) +{ + return (__m512i)__builtin_ia32_selectq_512(__M, + (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), + (__v8di)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512ifmavlintrin.h b/include-llvm/avx512ifmavlintrin.h new file mode 100644 index 0000000..5889401 --- /dev/null +++ b/include-llvm/avx512ifmavlintrin.h @@ -0,0 +1,119 @@ +/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __IFMAVLINTRIN_H +#define __IFMAVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256))) + + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y, + (__v2di) __Z); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + (__v4di)_mm256_setzero_si256()); +} + + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512pfintrin.h b/include-llvm/avx512pfintrin.h new file mode 100644 index 0000000..b8bcf49 --- /dev/null +++ b/include-llvm/avx512pfintrin.h @@ -0,0 +1,97 @@ +/*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== + * + * + * 
Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512PFINTRIN_H +#define __AVX512PFINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf"))) + +#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \ + __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfdps((__mmask16)(mask), \ + (__v16si)(__m512i)(index), (void const *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \ + __builtin_ia32_gatherpfdps((__mmask16) -1, \ + (__v16si)(__m512i)(index), (void const *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \ + __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \ + __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \ + __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ + (void const *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \ + __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \ + __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ + (void *)(addr), (int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfdps((__mmask16)(mask), \ + (__v16si)(__m512i)(index), (void *)(addr), \ + (int)(scale), (int)(hint)) + +#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \ + __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), \ + (int)(hint)) + +#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \ + __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ + (void *)(addr), 
(int)(scale), (int)(hint)) + +#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \ + __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ + (void *)(addr), (int)(scale), (int)(hint)) + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512vbmi2intrin.h b/include-llvm/avx512vbmi2intrin.h new file mode 100644 index 0000000..17fa777 --- /dev/null +++ b/include-llvm/avx512vbmi2intrin.h @@ -0,0 +1,357 @@ +/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VBMI2INTRIN_H +#define __AVX512VBMI2INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_si512(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_si512(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) +{ + __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) +{ + __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, + (__v32hi) _mm512_setzero_si512(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) +{ + return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, + (__v64qi) _mm512_setzero_si512(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + 
(__v32hi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, + (__v32hi) _mm512_setzero_si512(), + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) __S, + __U); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) +{ + return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, + (__v64qi) _mm512_setzero_si512(), + __U); +} + +#define _mm512_shldi_epi64(A, B, I) \ + ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shldi_epi64(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) + +#define _mm512_maskz_shldi_epi64(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_shldi_epi32(A, B, I) \ + ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shldi_epi32(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) + +#define _mm512_maskz_shldi_epi32(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_shldi_epi16(A, B, I) \ + ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shldi_epi16(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) + +#define _mm512_maskz_shldi_epi16(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) + +#define _mm512_shrdi_epi64(A, B, I) \ + ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)(__m512i)(S))) + +#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ + (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ + (__v8di)_mm512_setzero_si512())) + +#define _mm512_shrdi_epi32(A, B, I) \ + ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ + (__v16si)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)(__m512i)(S))) + +#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ + (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ + (__v16si)_mm512_setzero_si512())) + +#define _mm512_shrdi_epi16(A, B, I) \ + ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ + (__v32hi)(__m512i)(B), (int)(I))) + +#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + 
(__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)(__m512i)(S))) + +#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ + ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ + (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ + (__v32hi)_mm512_setzero_si512())) + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B, + (__v8di)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shldv_epi64(__A, __B, __C), + (__v8di)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shldv_epi64(__A, __B, __C), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shldv_epi32(__A, __B, __C), + (__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shldv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B, + (__v32hi)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), + (__v32hi)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shldv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B, + (__v8di)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), + (__v8di)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectq_512(__U, + (__v8di)_mm512_shrdv_epi64(__A, __B, __C), + (__v8di)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B, + (__v16si)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), + (__v16si)__A); +} 
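/*
 * Usage sketch (illustrative, not part of the upstream LLVM header): the
 * VPSHLD/VPSHRD intrinsics above concatenate each lane of the first two
 * operands into a double-width value, shift it by an immediate or by the
 * per-lane count in the third operand, and keep one half. Passing the same
 * vector as both data operands of _mm512_shldv_epi32() therefore gives a
 * per-lane variable rotate-left. The helper name below is an assumption for
 * illustration only and requires compiling with AVX512VBMI2 enabled.
 */
static __inline__ __m512i __DEFAULT_FN_ATTRS
_example_rotlv_epi32(__m512i __X, __m512i __Counts)
{
  /* rotl(x, c & 31) in every 32-bit lane: funnel-shift (x:x) left by c. */
  return _mm512_shldv_epi32(__X, __X, __Counts);
}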
+ +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i) __builtin_ia32_selectd_512(__U, + (__v16si)_mm512_shrdv_epi32(__A, __B, __C), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B, + (__v32hi)__C); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), + (__v32hi)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) +{ + return (__m512i)__builtin_ia32_selectw_512(__U, + (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), + (__v32hi)_mm512_setzero_si512()); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif + diff --git a/include-llvm/avx512vbmiintrin.h b/include-llvm/avx512vbmiintrin.h new file mode 100644 index 0000000..c0e0f94 --- /dev/null +++ b/include-llvm/avx512vbmiintrin.h @@ -0,0 +1,105 @@ +/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __VBMIINTRIN_H +#define __VBMIINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, + (__v64qi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)__I); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512(__U, + (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_permutexvar_epi8(__A, __B), + (__v64qi)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, + __m512i __B) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_permutexvar_epi8(__A, __B), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, + __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), + (__v64qi)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) +{ + return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, + (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), + (__v64qi)_mm512_setzero_si512()); +} + + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512vbmivlintrin.h b/include-llvm/avx512vbmivlintrin.h new file mode 100644 index 0000000..c5b96ae --- /dev/null +++ b/include-llvm/avx512vbmivlintrin.h @@ -0,0 +1,188 @@ +/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __VBMIVLINTRIN_H +#define __VBMIVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256))) + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, + (__v16qi)__I, + (__v16qi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)__I); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128(__U, + (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, + (__v32qi)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, __mmask32 __U, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)__I); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256(__U, + (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutexvar_epi8 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_permutexvar_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_permutexvar_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_permutexvar_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_permutexvar_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, + __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), + (__v32qi)_mm256_setzero_si256()); +} + + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlbf16intrin.h b/include-llvm/avx512vlbf16intrin.h new file mode 100644 index 0000000..adc43c1 --- /dev/null +++ b/include-llvm/avx512vlbf16intrin.h @@ -0,0 +1,530 @@ +/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBF16INTRIN_H +#define __AVX512VLBF16INTRIN_H + +#if (__clang_major__ <= 15) +typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); +#endif + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. 
+static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, + (__v4sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __B +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __B, and higher 64 bits come from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtne2ps_pbh(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, + (__v8sf) __B); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [16 x bfloat]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element from __W. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)__W); +} + +/// Convert Two Packed Single Data to One Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. 
+/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __B +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A or __B. A 0 means element is zero. +/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from +/// conversion of __B, and higher 128 bits come from conversion of __A. +static __inline__ __m256bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { + return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_cvtneps_pbh(__m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __W +/// A 128-bit vector of [8 x bfloat]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 4-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from +/// conversion of __A, and higher 64 bits are 0. +static __inline__ __m128bh __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_cvtneps_pbh(__m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_undefined_si128(), + (__mmask8)-1); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __W +/// A 256-bit vector of [8 x bfloat]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. 
A 0 means element from __W. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)__W, + (__mmask8)__U); +} + +/// Convert Packed Single Data to Packed BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means conversion of __A. A 0 means element is zero. +/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. +static __inline__ __m128bh __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { + return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, + (__v8hi)_mm_setzero_si128(), + (__mmask8)__U); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, + (__v4si)__A, + (__v4si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \param __B +/// A 128-bit vector of [8 x bfloat]. +/// \param __D +/// A 128-bit vector of [4 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 128-bit vector of [4 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_dpbf16_ps(__D, __A, __B), + (__v4sf)_mm_setzero_si128()); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. 
+/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, + (__v8si)__A, + (__v8si)__B); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 16-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)__D); +} + +/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDPBF16PS instructions. +/// +/// \param __A +/// A 256-bit vector of [16 x bfloat]. +/// \param __B +/// A 256-bit vector of [16 x bfloat]. +/// \param __D +/// A 256-bit vector of [8 x float]. +/// \param __U +/// A 8-bit mask value specifying what is chosen for each element. +/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. +/// \returns A 256-bit vector of [8 x float] comes from Dot Product of +/// __A, __B and __D +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), + (__v8sf)_mm256_setzero_si256()); +} + +/// Convert One Single float Data to One BF16 Data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. +/// +/// \param __A +/// A float data. +/// \returns A bf16 data whose sign field and exponent field keep unchanged, +/// and fraction field is truncated to 7 bits. +static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { + __v4sf __V = {__A, 0, 0, 0}; +#if (__clang_major__ > 15) + __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask( + (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1); + return (__bf16)__R[0]; +#else + __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( + (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); + return __R[0]; +#endif +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 128-bit vector of [4 x bfloat]. +/// \returns A 128-bit vector of [4 x float] come from conversion of __A +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { + return _mm_castsi128_ps( + (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data. +/// +/// \headerfile +/// +/// \param __A +/// A 128-bit vector of [8 x bfloat]. 
+/// \returns A 256-bit vector of [8 x float] come from conversion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 4-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [4 x bfloat]. +/// \returns A 128-bit vector of [4 x float] come from conversion of __A +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { + return _mm_castsi128_ps((__m128i)_mm_slli_epi32( + (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using zeroing mask. +/// +/// \headerfile +/// +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from conversion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( + (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 128-bit vector of [4 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 4-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [4 x bfloat]. +/// \returns A 128-bit vector of [4 x float] come from conversion of __A +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { + return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( + (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), + 16)); +} + +/// Convert Packed BF16 Data to Packed float Data using merging mask. +/// +/// \headerfile +/// +/// \param __S +/// A 256-bit vector of [8 x float]. Elements are copied from __S when +/// the corresponding mask bit is not set. +/// \param __U +/// A 8-bit mask. Elements are zeroed out when the corresponding mask +/// bit is not set. +/// \param __A +/// A 128-bit vector of [8 x bfloat]. +/// \returns A 256-bit vector of [8 x float] come from conversion of __A +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { + return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( + (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), + 16)); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlbitalgintrin.h b/include-llvm/avx512vlbitalgintrin.h new file mode 100644 index 0000000..5154eae --- /dev/null +++ b/include-llvm/avx512vlbitalgintrin.h @@ -0,0 +1,145 @@ +/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBITALGINTRIN_H +#define __AVX512VLBITALGINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi16(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, + (__v16hi) _mm256_popcnt_epi16(__B), + (__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi16(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, + (__v8hi) _mm_popcnt_epi16(__B), + (__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) +{ + return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi8(__m256i __A) +{ + return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, + (__v32qi) _mm256_popcnt_epi8(__B), + (__v32qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) +{ + return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), + __U, + __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi8(__m128i __A) +{ + return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, + (__v16qi) _mm_popcnt_epi8(__B), + (__v16qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) +{ + return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), + __U, + __B); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A, + (__v32qi) __B, + __U); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) +{ + return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1, + __A, + __B); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__mmask16) 
__builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A, + (__v16qi) __B, + __U); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) +{ + return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1, + __A, + __B); +} + + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlbwintrin.h b/include-llvm/avx512vlbwintrin.h new file mode 100644 index 0000000..7873516 --- /dev/null +++ b/include-llvm/avx512vlbwintrin.h @@ -0,0 +1,2809 @@ +/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLBWINTRIN_H +#define __AVX512VLBWINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256))) + +/* Integer compare */ + +#define _mm_cmp_epi8_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm_mask_cmp_epi8_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm_cmp_epu8_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm_mask_cmp_epu8_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm256_cmp_epi8_mask(a, b, p) \ + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) + +#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \ + ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m))) + +#define _mm256_cmp_epu8_mask(a, b, p) \ + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)-1)) + +#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \ + ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ + (__v32qi)(__m256i)(b), (int)(p), \ + (__mmask32)(m))) + +#define _mm_cmp_epi16_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epi16_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_epu16_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epu16_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ + (__v8hi)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_epi16_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + 
(__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm256_cmp_epu16_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)-1)) + +#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ + (__v16hi)(__m256i)(b), (int)(p), \ + (__mmask16)(m))) + +#define _mm_cmpeq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi8_mask(A, B) \ + _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi8_mask(k, A, B) \ + _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi8_mask(A, B) \ + _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \ + _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), 
_MM_CMPINT_LE) +#define _mm_cmplt_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu8_mask(A, B) \ + _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu8_mask(k, A, B) \ + _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu8_mask(A, B) \ + _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \ + _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi16_mask(A, B) \ + _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi16_mask(k, A, B) \ + _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) 
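/*
 * Usage sketch (illustrative, not part of the upstream LLVM header): the
 * _mm*_cmp{eq,ge,gt,le,lt,neq}_ep[iu]{8,16}_mask wrappers above expand to the
 * generic compare macros with one of the _MM_CMPINT_* predicates and return a
 * bitmask with one bit per lane. The helper name below is an assumption for
 * illustration only and requires compiling with AVX512BW and AVX512VL enabled.
 */
static __inline__ int __DEFAULT_FN_ATTRS256
_example_count_negative_epi16(__m256i __V)
{
  /* Bit i of the mask is set when 16-bit lane i of __V is less than zero. */
  __mmask16 __M = _mm256_cmplt_epi16_mask(__V, _mm256_setzero_si256());
  return __builtin_popcount((unsigned int)__M);
}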
+#define _mm256_mask_cmplt_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi16_mask(A, B) \ + _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \ + _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu16_mask(A, B) \ + _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu16_mask(k, A, B) \ + _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu16_mask(A, B) \ + _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \ + _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_add_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + 
(__v16hi)_mm256_add_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_sub_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sub_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_add_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_add_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_sub_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sub_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mullo_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mullo_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, + (__v16qi) __W, + (__v16qi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, + (__v32qi) __W, + (__v32qi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) +{ + return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, + (__v8hi) __W, + (__v8hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) +{ + return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, + (__v16hi) __W, + (__v16hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_abs_epi8(__A), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_abs_epi8(__A), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_abs_epi8(__A), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_abs_epi8(__A), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_abs_epi16(__A), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_abs_epi16(__A), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_abs_epi16(__A), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_abs_epi16(__A), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packs_epi32(__m128i __W, __mmask8 
__M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packs_epi32(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packs_epi32(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packs_epi16(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packs_epi16(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_packus_epi32(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_packus_epi32(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_packus_epi16(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_packus_epi16(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_adds_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_adds_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_adds_epu16(__mmask8 __U, __m128i 
__A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_adds_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_adds_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_avg_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_avg_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_avg_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_avg_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return 
(__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_max_epu8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_max_epu8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_max_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_max_epu16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + 
(__v16qi)_mm_min_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm_min_epu8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, + (__v32qi)_mm256_min_epu8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_min_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_min_epu16(__A, __B), + (__v16hi)__W); +} + +static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_shuffle_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_shuffle_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_subs_epu8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 +_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_subs_epu8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_subs_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, + __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_subs_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi) __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)__I); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)__I); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 +_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_maddubs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, + __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_maddubs_epi16(__X, __Y), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_maddubs_epi16(__X, __Y), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_madd_epi16(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_madd_epi16(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_madd_epi16(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi16_epi8 (__m128i __A) { + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, + (__v16qi) _mm_setzero_si128(), + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi16_epi8 (__m256i __A) { + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi) _mm_setzero_si128(), + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) { + return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, + (__v16qi) _mm_setzero_si128(), + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi16_epi8 (__m128i __A) { + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi16_epi8 
(__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, + (__v16qi) _mm_setzero_si128(), + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi16_epi8 (__m256i __A) { + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi) _mm_setzero_si128(), + (__mmask16) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) { + return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, + (__v16qi) _mm_setzero_si128(), + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi16_epi8 (__m128i __A) { + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8hi)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) { + return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, + (__v16qi) _mm_setzero_si128(), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); +} + + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_epi8 (__m256i __A) { + return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)__O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, + (__v16qi)_mm256_cvtepi16_epi8(__A), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) +{ + __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) +{ + __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) +{ + __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) 
__A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mulhrs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhrs_epi16(__X, __Y), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhrs_epi16(__X, __Y), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epu16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epu16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_mulhi_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_mulhi_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_unpackhi_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_unpackhi_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + 
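+/* Illustrative sketch (not part of the upstream LLVM header): the mask_/maskz_
+ * wrappers above all follow the same select pattern -- compute the unmasked
+ * result, then blend it under the write mask with either the pass-through
+ * operand __W (merge masking) or an all-zero vector (zero masking).
+ * A hypothetical caller, assuming -mavx512vl -mavx512bw and placeholder
+ * vectors a and b, might observe:
+ *
+ *   __m128i src = _mm_set1_epi16(-1);
+ *   __m128i sum = _mm_mask_add_epi16(src, 0x0F, a, b);  // low 4 words: a+b,
+ *                                                       // high 4 words: -1
+ *   __m128i zs  = _mm_maskz_add_epi16(0x0F, a, b);      // high 4 words: 0
+ */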
+static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_unpackhi_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_unpackhi_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_unpackhi_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_unpackhi_epi16(__A, __B), + (__v8hi) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_unpackhi_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_unpackhi_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_unpacklo_epi8(__A, __B), + (__v16qi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, + (__v16qi)_mm_unpacklo_epi8(__A, __B), + (__v16qi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_unpacklo_epi8(__A, __B), + (__v32qi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, + (__v32qi)_mm256_unpacklo_epi8(__A, __B), + (__v32qi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_unpacklo_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_unpacklo_epi16(__A, __B), + (__v8hi) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_unpacklo_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + 
(__v16hi)_mm256_unpacklo_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepi8_epi16(__A), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepi8_epi16(__A), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepi8_epi16(__A), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepi8_epi16(__A), + (__v16hi)_mm256_setzero_si256()); +} + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepu8_epi16(__A), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_cvtepu8_epi16(__A), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepu8_epi16(__A), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_cvtepu8_epi16(__A), + (__v16hi)_mm256_setzero_si256()); +} + + +#define _mm_mask_shufflehi_epi16(W, U, A, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W))) + +#define _mm_maskz_shufflehi_epi16(U, A, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_shufflehi_epi16(U, A, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_mask_shufflelo_epi16(W, U, A, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)(__m128i)(W))) + +#define _mm_maskz_shufflelo_epi16(U, A, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_shufflelo_epi16(U, A, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shufflelo_epi16((A), \ + (imm)), \ + (__v16hi)_mm256_setzero_si256())) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_sllv_epi16(__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sllv_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sllv_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_sllv_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sllv_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sllv_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sll_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sll_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sll_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sll_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_slli_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_slli_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, + unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_slli_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_slli_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srlv_epi16(__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srlv_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srlv_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srlv_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srav_epi16(__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srav_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srav_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srav_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_sra_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_sra_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srai_epi16(__m128i __W, __mmask8 
__U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srai_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srai_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, + unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srai_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srai_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srl_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srl_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srli_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, + (__v8hi)_mm_srli_epi16(__A, __B), + (__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, + (__v16hi)_mm256_srli_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, + (__v8hi) __A, + (__v8hi) __W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, + (__v8hi) __A, + (__v8hi) _mm_setzero_si128 ()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, + (__v16hi) __A, + 
(__v16hi) __W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, + (__v16hi) __A, + (__v16hi) _mm256_setzero_si256 ()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, + (__v16qi) __A, + (__v16qi) __W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, + (__v16qi) __A, + (__v16qi) _mm_setzero_si128 ()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, + (__v32qi) __A, + (__v32qi) __W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, + (__v32qi) __A, + (__v32qi) _mm256_setzero_si256 ()); +} + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) +{ + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_set1_epi8 (__mmask16 __M, char __A) +{ + return (__m128i) __builtin_ia32_selectb_128(__M, + (__v16qi) _mm_set1_epi8(__A), + (__v16qi) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) +{ + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_set1_epi8 (__mmask32 __M, char __A) +{ + return (__m256i) __builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_set1_epi8(__A), + (__v32qi) _mm256_setzero_si256()); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi16*)__P)->__v; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P, + (__v8hi) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P, + (__v8hi) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi16 (void const *__P) +{ + struct __loadu_epi16 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi16*)__P)->__v; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P, + (__v16hi) __W, + (__mmask16) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P, + (__v16hi) + _mm256_setzero_si256 (), + (__mmask16) __U); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi8 (void const *__P) +{ + struct 
__loadu_epi8 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi8*)__P)->__v; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P, + (__v16qi) __W, + (__mmask16) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P, + (__v16qi) + _mm_setzero_si128 (), + (__mmask16) __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi8 (void const *__P) +{ + struct __loadu_epi8 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi8*)__P)->__v; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P, + (__v32qi) __W, + (__mmask32) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P, + (__v32qi) + _mm256_setzero_si256 (), + (__mmask32) __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi16 (void *__P, __m128i __A) +{ + struct __storeu_epi16 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedquhi128_mask ((__v8hi *) __P, + (__v8hi) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi16 (void *__P, __m256i __A) +{ + struct __storeu_epi16 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi16*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) +{ + __builtin_ia32_storedquhi256_mask ((__v16hi *) __P, + (__v16hi) __A, + (__mmask16) __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi8 (void *__P, __m128i __A) +{ + struct __storeu_epi8 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) +{ + __builtin_ia32_storedquqi128_mask ((__v16qi *) __P, + (__v16qi) __A, + (__mmask16) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi8 (void *__P, __m256i __A) +{ + struct __storeu_epi8 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi8*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) +{ + __builtin_ia32_storedquqi256_mask ((__v32qi *) __P, + (__v32qi) __A, + (__mmask32) __U); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_test_epi8_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 
+_mm256_test_epi8_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_test_epi16_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 +_mm256_test_epi16_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256 ()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 +_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_testn_epi8_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_testn_epi8_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_testn_epi16_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 +_mm256_testn_epi16_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 +_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 +_mm_movepi8_mask (__m128i __A) +{ + return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A); +} + +static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 +_mm256_movepi8_mask (__m256i __A) +{ + return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_movepi16_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A); +} + +static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 +_mm256_movepi16_mask (__m256i __A) +{ + return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_movm_epi8 (__mmask16 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2b128 (__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_movm_epi8 (__mmask32 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2b256 (__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_movm_epi16 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2w128 (__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_movm_epi16 (__mmask16 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2w256 (__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectb_128(__M, + (__v16qi) _mm_broadcastb_epi8(__A), + (__v16qi) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectb_128(__M, + (__v16qi) _mm_broadcastb_epi8(__A), + (__v16qi) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_broadcastb_epi8(__A), + (__v32qi) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectb_256(__M, + (__v32qi) _mm256_broadcastb_epi8(__A), + (__v32qi) _mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128(__M, + (__v8hi) _mm_broadcastw_epi16(__A), + (__v8hi) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectw_128(__M, + (__v8hi) _mm_broadcastw_epi16(__A), + (__v8hi) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256(__M, + (__v16hi) _mm256_broadcastw_epi16(__A), + (__v16hi) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectw_256(__M, + (__v16hi) _mm256_broadcastw_epi16(__A), + (__v16hi) _mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) +{ + return (__m256i) __builtin_ia32_selectw_256 (__M, + (__v16hi) _mm256_set1_epi16(__A), + (__v16hi) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_set1_epi16 (__mmask16 __M, short __A) +{ + return (__m256i) __builtin_ia32_selectw_256(__M, + (__v16hi)_mm256_set1_epi16(__A), + (__v16hi) _mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) +{ + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_set1_epi16 (__mmask8 __M, short __A) +{ + return (__m128i) __builtin_ia32_selectw_128(__M, + (__v8hi) _mm_set1_epi16(__A), + (__v8hi) _mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_permutexvar_epi16 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_permutexvar_epi16(__A, __B), + (__v8hi) _mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, + (__v8hi)_mm_permutexvar_epi16(__A, __B), + (__v8hi)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_epi16 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_permutexvar_epi16(__A, __B), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, + __m256i __B) +{ + return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, + (__v16hi)_mm256_permutexvar_epi16(__A, __B), + (__v16hi)__W); +} + +#define _mm_mask_alignr_epi8(W, U, A, B, N) \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ + (__v16qi)(__m128i)(W))) + +#define _mm_maskz_alignr_epi8(U, A, B, N) \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ + (__v16qi)_mm_setzero_si128())) + +#define _mm256_mask_alignr_epi8(W, U, A, B, N) \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ + (__v32qi)(__m256i)(W))) + +#define _mm256_maskz_alignr_epi8(U, A, B, N) \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ + (__v32qi)_mm256_setzero_si256())) + +#define _mm_dbsad_epu8(A, B, imm) \ + ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(imm))) + +#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)(__m128i)(W))) + +#define _mm_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_dbsad_epu8(A, B, imm) \ + ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), (int)(imm))) + +#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ + (__v16hi)(__m256i)(W))) + +#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ + (__v16hi)_mm256_setzero_si256())) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __AVX512VLBWINTRIN_H */ diff --git a/include-llvm/avx512vlcdintrin.h b/include-llvm/avx512vlcdintrin.h new file mode 100644 index 0000000..cc8b725 --- /dev/null +++ b/include-llvm/avx512vlcdintrin.h @@ -0,0 +1,225 @@ +/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLCDINTRIN_H +#define __AVX512VLCDINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256))) + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m128i) _mm_set1_epi64x((long long) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastmb_epi64 (__mmask8 __A) +{ + return (__m256i) _mm256_set1_epi64x((long long)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m128i) _mm_set1_epi32((int)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcastmw_epi32 (__mmask16 __A) +{ + return (__m256i) _mm256_set1_epi32((int)__A); +} + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_conflict_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_conflict_epi64(__A), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_conflict_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_conflict_epi64(__A), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_conflict_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_conflict_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_conflict_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)__W); +} + +static 
__inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_conflict_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_lzcnt_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntd_128 ((__v4si) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_lzcnt_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_lzcnt_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_lzcnt_epi32 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_lzcnt_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_lzcnt_epi64 (__m128i __A) +{ + return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_lzcnt_epi64(__A), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_lzcnt_epi64(__A), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_lzcnt_epi64 (__m256i __A) +{ + return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_lzcnt_epi64(__A), + (__v4di)_mm256_setzero_si256()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __AVX512VLCDINTRIN_H */ diff --git a/include-llvm/avx512vldqintrin.h b/include-llvm/avx512vldqintrin.h new file mode 100644 index 0000000..713e1a1 --- /dev/null +++ b/include-llvm/avx512vldqintrin.h @@ -0,0 +1,1167 @@ +/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __AVX512VLDQINTRIN_H +#define __AVX512VLDQINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mullo_epi64 (__m256i __A, __m256i __B) { + return (__m256i) ((__v4du) __A * (__v4du) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_mullo_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mullo_epi64 (__m128i __A, __m128i __B) { + return (__m128i) ((__v2du) __A * (__v2du) __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_mullo_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_andnot_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_andnot_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_andnot_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_andnot_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return 
(__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_andnot_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_and_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_and_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_and_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_and_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_and_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_and_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_and_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_xor_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_xor_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_xor_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_xor_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_xor_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, 
__m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_xor_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_or_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_or_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_or_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_or_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_or_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_or_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_or_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtpd_epi64 (__m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtpd_epi64 (__m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + 
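/*
 * Usage sketch (illustrative only, not part of the upstream LLVM header):
 * the _mm*_mask_* intrinsics above blend their result into the passthrough
 * operand __W wherever a mask bit is 0, while the _mm*_maskz_* forms zero
 * those lanes instead. The helper name below is hypothetical and assumes
 * compilation with -mavx512vl -mavx512dq (or equivalent __target__ attributes).
 */
#if 0 /* example only */
static inline __m256i cvt_low_lanes_example(__m256i passthru, __m256d v)
{
    /* Convert only lanes 0 and 1 (mask 0b0011); lanes 2 and 3 keep passthru. */
    return _mm256_mask_cvtpd_epi64(passthru, (__mmask8)0x3, v);
}
#endif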
+static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtpd_epu64 (__m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtpd_epu64 (__m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtps_epi64 (__m128 __A) { + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtps_epi64 (__m128 __A) { + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtps_epu64 (__m128 __A) { + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtps_epu64 (__m128 __A) { + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtps_epu64 
(__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_cvtepi64_pd (__m128i __A) { + return (__m128d)__builtin_convertvector((__v2di)__A, __v2df); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepi64_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepi64_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_pd (__m256i __A) { + return (__m256d)__builtin_convertvector((__v4di)__A, __v4df); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepi64_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtepi64_ps (__m128i __A) { + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) _mm_setzero_ps(), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) { + return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, + (__v4sf) _mm_setzero_ps(), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_ps (__m256i __A) { + return (__m128)__builtin_convertvector((__v4di)__A, __v4sf); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepi64_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttpd_epi64 (__m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i 
__DEFAULT_FN_ATTRS256 +_mm256_cvttpd_epi64 (__m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttpd_epu64 (__m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttpd_epu64 (__m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) { + return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttps_epi64 (__m128 __A) { + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttps_epi64 (__m128 __A) { + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttps_epu64 (__m128 __A) { + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, + (__v2di) _mm_setzero_si128(), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttps_epu64 (__m128 __A) { + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { + return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, + (__v4di) _mm256_setzero_si256(), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_cvtepu64_pd (__m128i __A) { + return (__m128d)__builtin_convertvector((__v2du)__A, __v2df); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepu64_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtepu64_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_cvtepu64_pd (__m256i __A) { + return (__m256d)__builtin_convertvector((__v4du)__A, __v4df); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtepu64_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtepu64_ps (__m128i __A) { + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) _mm_setzero_ps(), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) { + return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, + (__v4sf) _mm_setzero_ps(), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_cvtepu64_ps (__m256i __A) { + return (__m128)__builtin_convertvector((__v4du)__A, __v4sf); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtepu64_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + 
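/*
 * Usage sketch (illustrative only, not part of the upstream LLVM header):
 * zero-masked conversion of unsigned 64-bit lanes to double; lanes whose
 * mask bit is 0 come back as 0.0 rather than being preserved. The helper
 * name is hypothetical and assumes -mavx512vl -mavx512dq.
 */
#if 0 /* example only */
static inline __m256d cvtepu64_even_lanes_example(__m256i v)
{
    /* Keep lanes 0 and 2 (mask 0b0101); lanes 1 and 3 become 0.0. */
    return _mm256_maskz_cvtepu64_pd((__mmask8)0x5, v);
}
#endif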
+#define _mm_range_pd(A, B, C) \ + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm_mask_range_pd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_range_pd(U, A, B, C) \ + ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), (int)(C), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_range_pd(A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_range_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_range_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(C), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_range_ps(A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm_mask_range_ps(W, U, A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)(__m128)(W), (__mmask8)(U))) + +#define _mm_maskz_range_ps(U, A, B, C) \ + ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), (int)(C), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_range_ps(A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_range_ps(W, U, A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)(__m256)(W), (__mmask8)(U))) + +#define _mm256_maskz_range_ps(U, A, B, C) \ + ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(C), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm_reduce_pd(A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_pd(W, U, A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_reduce_pd(U, A, B) \ + ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_reduce_pd(A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_reduce_pd(W, U, A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_reduce_pd(U, A, B) \ + ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_reduce_ps(A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_ps(W, U, A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), 
(int)(B), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_reduce_ps(U, A, B) \ + ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_reduce_ps(A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_reduce_ps(W, U, A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_reduce_ps(U, A, B) \ + ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_movepi32_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_movepi32_mask (__m256i __A) +{ + return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_movm_epi32 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2d128 (__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_movm_epi32 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2d256 (__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_movm_epi64 (__mmask8 __A) +{ + return (__m128i) __builtin_ia32_cvtmask2q128 (__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_movm_epi64 (__mmask8 __A) +{ + return (__m256i) __builtin_ia32_cvtmask2q256 (__A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_movepi64_mask (__m128i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_movepi64_mask (__m256i __A) +{ + return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_broadcast_f32x2 (__m128 __A) +{ + return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)__O); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x2(__A), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_broadcast_f64x2(__m128d __A) +{ + return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, + 0, 1, 0, 1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)__O); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, + (__v4df)_mm256_broadcast_f64x2(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_broadcast_i32x2 (__m128i __A) +{ + return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 0, 1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) +{ + 
return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)__O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_broadcast_i32x2(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcast_i32x2 (__m128i __A) +{ + return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 0, 1, 0, 1, 0, 1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)__O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x2(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcast_i64x2(__m128i __A) +{ + return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, + 0, 1, 0, 1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)__O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_broadcast_i64x2(__A), + (__v4di)_mm256_setzero_si256()); +} + +#define _mm256_extractf64x2_pd(A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_undefined_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_extractf64x2_pd(U, A, imm) \ + ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_extracti64x2_epi64(A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_undefined_si128(), \ + (__mmask8)-1)) + +#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \ + ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ + (int)(imm), \ + (__v2di)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_insertf64x2(A, B, imm) \ + ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ + (__v2df)(__m128d)(B), (int)(imm))) + +#define _mm256_mask_insertf64x2(W, U, A, B, imm) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_insertf64x2(U, A, B, imm) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_inserti64x2(A, B, imm) \ + ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) + +#define _mm256_mask_inserti64x2(W, U, A, B, 
imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) + +#define _mm256_maskz_inserti64x2(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) + +#define _mm_mask_fpclass_pd_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_fpclass_pd_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_fpclass_pd_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_fpclass_pd_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fpclass_ps_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_fpclass_ps_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_fpclass_ps_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_fpclass_ps_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__mmask8)-1)) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlfp16intrin.h b/include-llvm/avx512vlfp16intrin.h new file mode 100644 index 0000000..3d27853 --- /dev/null +++ b/include-llvm/avx512vlfp16intrin.h @@ -0,0 +1,2068 @@ +/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use <avx512vlfp16intrin.h> directly; include <immintrin.h> instead." +#endif + +#ifndef __AVX512VLFP16INTRIN_H +#define __AVX512VLFP16INTRIN_H + +/* Define the default attributes for the functions in this file.
*/ +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, \ + __target__("avx512fp16, avx512vl"), \ + __min_vector_width__(128))) + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { + return __a[0]; +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { + return __a[0]; +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { + return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { + return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { + return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, + __h, __h, __h, __h, __h, __h, __h, __h}; +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 +_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { + return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 +_mm256_set1_pch(_Float16 _Complex h) { + return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); +} + +static __inline __m128h __DEFAULT_FN_ATTRS128 +_mm_set1_pch(_Float16 _Complex h) { + return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 +_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, + _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, + _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, + _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { + return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11, + __h10, __h9, __h8, __h7, __h6, __h5, + __h4, __h3, __h2, __h1}; +} + +#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \ + _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1)) + +#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ + h14, h15, h16) \ + _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ + (h7), (h6), (h5), (h4), (h3), (h2), (h1)) + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A + (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A + (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), + 
(__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A - (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A - (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A * (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A * (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, + __m256h __B) { + return (__m256h)((__v16hf)__A / (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, + __m128h __B) { + return (__m128h)((__v8hf)__A / (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, + __m128h __A, + __m128h 
__B) { + return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { + return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { + return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { + return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, 
__m256h __A) { + return (__m256h)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectps_256( + (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { + return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_selectps_128( + (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); +} + +#define _mm256_cmp_ph_mask(a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) + +#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask16)__builtin_ia32_cmpph256_mask( \ + (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) + +#define _mm_cmp_ph_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) + +#define _mm_mask_cmp_ph_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpph128_mask( \ + (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rcpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rcpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_rsqrtph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { + return 
(__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_rsqrtph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { + return (__m128h)__builtin_ia32_getexpph128_mask( + (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, + (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_getexpph256_mask( + (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_getmant_ph(A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ph(W, U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ph(U, A, B, C) \ + ((__m128h)__builtin_ia32_getmantph128_mask( \ + (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_getmant_ph(A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) + +#define _mm256_mask_getmant_ph(W, U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_getmant_ph(U, A, B, C) \ + ((__m256h)__builtin_ia32_getmantph256_mask( \ + (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ + (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, + (__v8hf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_scalefph128_mask( + (__v8hf)__A, (__v8hf)__B, 
(__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, + (__v16hf)__W, (__mmask16)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_scalefph256_mask( + (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); +} + +#define _mm_roundscale_ph(A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_roundscale_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_roundscale_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_roundscale_ph(A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define _mm256_mask_roundscale_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_roundscale_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ + (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +#define _mm_reduce_ph(A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)-1)) + +#define _mm_mask_reduce_ph(W, U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask( \ + (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) + +#define _mm_maskz_reduce_ph(U, A, imm) \ + ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ + (__v8hf)_mm_setzero_ph(), \ + (__mmask8)(U))) + +#define _mm256_reduce_ph(A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)-1)) + +#define _mm256_mask_reduce_ph(W, U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)(__m256h)(W), \ + (__mmask16)(U))) + +#define _mm256_maskz_reduce_ph(U, A, imm) \ + ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ + (__v16hf)_mm256_setzero_ph(), \ + (__mmask16)(U))) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { + return __builtin_ia32_sqrtph((__v8hf)__a); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, + __mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, + __m128h __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { + return 
(__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_sqrt_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +#define _mm_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)(U))) + +#define _mm_fpclass_ph_mask(A, imm) \ + ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ + (int)(imm), (__mmask8)-1)) + +#define _mm256_mask_fpclass_ph_mask(U, A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)(U))) + +#define _mm256_fpclass_ph_mask(A, imm) \ + ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ + (int)(imm), (__mmask16)-1)) + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, + __mmask8 __U, + __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( + (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { + return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( + (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, + __mmask8 __U, + __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, + (__mmask8)__U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m128d)__builtin_ia32_vcvtph2pd128_mask( + (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, + (__mmask8)__U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { + return (__m256d)__builtin_ia32_vcvtph2pd256_mask( + (__v8hf)__A, 
(__v4df)_mm256_setzero_pd(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2w128_mask( + (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2w256_mask( + (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + 
(__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepi16_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvtph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uw128_mask( + (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu16(__m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, + (__mmask16)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { + return (__m256i)__builtin_ia32_vcvttph2uw256_mask( + (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { + return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 
+_mm256_cvtepu16_ph(__m256i __A) { + return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, + (__v16hf)_mm256_cvtepu16_ph(__A), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( + (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( + (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_ph(__m256i __A) { + return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2dq128_mask( + (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2dq256_mask( + (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 
+} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2udq128_mask( + (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu32(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2udq256_mask( + (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( + (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( + (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { + return 
(__m256i)__builtin_ia32_vcvtph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( + (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_cvtepu64_ph(__m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { + return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( + (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2qq128_mask( + (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epi64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2qq256_mask( + (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, + (__mmask8)__U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( + (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttph_epu64(__m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, + (__mmask8)__U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { + return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( + (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, + __mmask8 __U, + __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, + (__mmask8)__U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m128)__builtin_ia32_vcvtph2psx128_mask( + (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, + (__mmask8)__U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { + return (__m256)__builtin_ia32_vcvtph2psx256_mask( + (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, + __mmask8 __U, + __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx128_mask( + (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + 
+static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, + (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { + return (__m128h)__builtin_ia32_vcvtps2phx256_mask( + (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, + __mmask8 __U, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, + __mmask8 __U, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_ph(__m256h __A, 
__mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)_mm_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)_mm256_setzero_ph()); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ 
__m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, + (__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, + (__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, + -(__v8hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + (__v8hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_selectph_128( + (__mmask8)__U, + __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), + (__v8hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, + -(__v16hf)__C); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + (__v16hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { + return (__m256h)__builtin_ia32_selectph_256( + (__mmask16)__U, + __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), + (__v16hf)__C); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfcmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS128 
_mm256_fcmul_pch(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_vfcmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfcmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectps_128( + __U, + __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, + (__v4sf)__C, (__mmask8)__U), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( + (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectps_256( + __U, + __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, + (__mmask8)__U), + (__v8sf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( + (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, + __mmask8 __U, + __m128h __A, + __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__W, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { + return (__m128h)__builtin_ia32_vfmulcph128_mask( + (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, + __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, 
(__v8sf)_mm256_undefined_ph(), (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__W, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { + return (__m256h)__builtin_ia32_vfmulcph256_mask( + (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, + __m128h __B, + __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)-1); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_selectps_128( + __U, + __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, + (__mmask8)__U), + (__v4sf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { + return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { + return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, + (__v4sf)__C, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, + __m256h __B, + __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)-1); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_selectps_256( + __U, + __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, + (__mmask8)__U), + (__v8sf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { + return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { + return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, + (__v8sf)__C, (__mmask8)__U); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, + __m128h __A, + __m128h __W) { + return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, + (__v8hf)__A); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { + return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, + (__v16hf)__A); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { + return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, + (__v8hi)__B); +} + +static __inline__ __m256h __DEFAULT_FN_ATTRS256 +_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { + return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, + (__v16hi)__B); +} + +static __inline__ __m128h __DEFAULT_FN_ATTRS128 +_mm_permutexvar_ph(__m128i __A, __m128h __B) { + return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); +} + +static __inline__ 
__m256h __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_ph(__m256i __A, __m256h __B) { + return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_add_ph(__m256h __W) { + return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_mul_ph(__m256h __W) { + return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_max_ph(__m256h __V) { + return __builtin_ia32_reduce_fmax_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS256 +_mm256_reduce_min_ph(__m256h __V) { + return __builtin_ia32_reduce_fmin_ph256(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_add_ph(__m128h __W) { + return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_mul_ph(__m128h __W) { + return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_max_ph(__m128h __V) { + return __builtin_ia32_reduce_fmax_ph128(__V); +} + +static __inline__ _Float16 __DEFAULT_FN_ATTRS128 +_mm_reduce_min_ph(__m128h __V) { + return __builtin_ia32_reduce_fmin_ph128(__V); +} + +// intrinsics below are alias for f*mul_*ch +#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B) +#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B) +#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B) +#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B) +#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B) +#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B) + +#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B) +#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B) +#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B) +#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B) +#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B) +#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B) + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlintrin.h b/include-llvm/avx512vlintrin.h new file mode 100644 index 0000000..c6b4a44 --- /dev/null +++ b/include-llvm/avx512vlintrin.h @@ -0,0 +1,8485 @@ +/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use <avx512vlintrin.h> directly; include <immintrin.h> instead."
+#endif + +#ifndef __AVX512VLINTRIN_H +#define __AVX512VLINTRIN_H + +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256))) + +typedef short __v2hi __attribute__((__vector_size__(4))); +typedef char __v4qi __attribute__((__vector_size__(4))); +typedef char __v2qi __attribute__((__vector_size__(2))); + +/* Integer compare */ + +#define _mm_cmpeq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi32_mask(A, B) \ + _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi32_mask(k, A, B) \ + _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi32_mask(A, B) \ + _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \ + _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu32_mask(k, A, B) \ + 
_mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu32_mask(A, B) \ + _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu32_mask(k, A, B) \ + _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu32_mask(A, B) \ + _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \ + _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epi64_mask(A, B) \ + _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epi64_mask(k, A, B) \ + _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) +#define 
_mm256_cmplt_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epi64_mask(A, B) \ + _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \ + _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm_cmpeq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm_mask_cmpeq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm_cmpge_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm_mask_cmpge_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm_cmpgt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm_mask_cmpgt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm_cmple_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm_mask_cmple_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm_cmplt_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm_mask_cmplt_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm_cmpneq_epu64_mask(A, B) \ + _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm_mask_cmpneq_epu64_mask(k, A, B) \ + _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) + +#define _mm256_cmpeq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) +#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) +#define _mm256_cmpge_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) +#define _mm256_mask_cmpge_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) +#define _mm256_cmpgt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) +#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) +#define _mm256_cmple_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) +#define _mm256_mask_cmple_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) +#define _mm256_cmplt_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) +#define _mm256_mask_cmplt_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) +#define _mm256_cmpneq_epu64_mask(A, B) \ + _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) +#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \ + _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_add_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_add_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_add_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, 
__m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_add_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sub_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sub_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_add_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_add_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_add_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_add_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sub_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sub_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sub_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sub_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epi32(__X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epi32(__X, 
__Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epi32(__X, __Y), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_mul_epu32(__X, __Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_mul_epu32(__X, __Y), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_mullo_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_mullo_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_and_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a & (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_and_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_and_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a & (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_and_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_andnot_epi32(__m256i __A, __m256i __B) +{ + return (__m256i)(~(__v8su)__A & (__v8su)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_andnot_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(), + __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_andnot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i)(~(__v4su)__A & (__v4su)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_andnot_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_or_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a | (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_or_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_or_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a | (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_or_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_xor_epi32(__m256i __a, __m256i __b) +{ + return (__m256i)((__v8su)__a ^ (__v8su)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_xor_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_xor_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a ^ (__v4su)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_xor_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_and_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a & (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_and_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_and_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a & (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_and_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_andnot_epi64(__m256i __A, __m256i __B) +{ + return (__m256i)(~(__v4du)__A & (__v4du)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_andnot_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(), + __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_andnot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)(~(__v2du)__A & (__v2du)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_andnot_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_or_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a | (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_or_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_or_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a | (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_or_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_xor_epi64(__m256i __a, __m256i __b) +{ + return (__m256i)((__v4du)__a ^ (__v4du)__b); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_xor_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_xor_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a ^ (__v2du)__b); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, + __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_xor_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B); +} + +#define _mm_cmp_epi32_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epi32_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_epu32_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epu32_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ + (__v4si)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_epi32_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_epu32_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ + (__v8si)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_epi64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epi64_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_epu64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_epu64_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ + (__v2di)(__m128i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_epi64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \ + 
((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_epu64_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ + (__v4di)(__m256i)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_ps_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_ps_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm256_cmp_pd_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm256_mask_cmp_pd_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_ps_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_ps_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (int)(p), \ + (__mmask8)(m))) + +#define _mm_cmp_pd_mask(a, b, p) \ + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)-1)) + +#define _mm_mask_cmp_pd_mask(m, a, b, p) \ + ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (int)(p), \ + (__mmask8)(m))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 
+_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd (-(__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 (-(__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 
+_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps (-(__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + 
return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 (-(__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + (__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + (__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 
__U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, + __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return 
(__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddsubpd ((__v2df) __A, + (__v2df) __B, + -(__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, + (__v4df) __B, + -(__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddsubps ((__v4sf) __A, + (__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, + (__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + (__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + (__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + (__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + (__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) +{ + return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, + __builtin_ia32_vfmaddpd ((__v2df) __A, + -(__v2df) __B, + -(__v2df) __C), + (__v2df) __C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + 
-(__v4df) __B, + -(__v4df) __C), + (__v4df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) +{ + return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, + __builtin_ia32_vfmaddpd256 ((__v4df) __A, + -(__v4df) __B, + -(__v4df) __C), + (__v4df) __C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) +{ + return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, + __builtin_ia32_vfmaddps ((__v4sf) __A, + -(__v4sf) __B, + -(__v4sf) __C), + (__v4sf) __C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) +{ + return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, + __builtin_ia32_vfmaddps256 ((__v8sf) __A, + -(__v8sf) __B, + -(__v8sf) __C), + (__v8sf) __C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_add_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_add_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_add_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_add_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_add_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_add_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_add_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_add_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) { + return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) 
__U, + (__v4si) __W, + (__v4si) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) { + return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, + (__v8si) __W, + (__v8si) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) { + return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, + (__v2df) __W, + (__v2df) __A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) { + return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, + (__v4df) __W, + (__v4df) __A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) { + return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, + (__v4sf) __W, + (__v4sf) __A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) { + return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, + (__v8sf) __W, + (__v8sf) __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) { + return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, + (__v2di) __W, + (__v2di) __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) { + return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, + (__v4di) __W, + (__v4di) __A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + 
(__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) { + __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) { + __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) { + __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) { + __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) { + __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) { + __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) { + __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + 
(__v2df)_mm_cvtepi32_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepi32_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepi32_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepi32_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepi32_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepi32_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepi32_ps(__A), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepi32_ps(__A), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvtpd_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvtpd_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) { + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) { + return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtpd_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm256_cvtpd_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtpd_epu32 (__m128d __A) { + 
return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtpd_epu32 (__m256d __A) { + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) { + return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtps_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtps_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtps_epi32(__A), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtps_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtps_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_cvtps_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtps_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_cvtps_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtps_epu32 (__m128 __A) { + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvtps_epu32 (__m256 __A) { + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) { + return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvttpd_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm256_cvttpd_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttpd_epu32 (__m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvttpd_epu32 (__m256d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) { + return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvttps_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvttps_epi32(__A), + 
(__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvttps_epi32(__A), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvttps_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvttps_epu32 (__m128 __A) { + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) { + return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cvttps_epu32 (__m256 __A) { + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) -1); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { + return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_cvtepu32_pd (__m128i __A) { + return (__m128d) __builtin_convertvector( + __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepu32_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, + (__v2df)_mm_cvtepu32_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_pd (__m128i __A) { + return (__m256d)__builtin_convertvector((__v4su)__A, __v4df); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepu32_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, + (__v4df)_mm256_cvtepu32_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtepu32_ps (__m128i __A) { + return (__m128)__builtin_convertvector((__v4su)__A, __v4sf); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepu32_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 
+_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_cvtepu32_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_cvtepu32_ps (__m256i __A) { + return (__m256)__builtin_convertvector((__v8su)__A, __v8sf); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepu32_ps(__A), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_cvtepu32_ps(__A), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_div_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_div_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_div_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_div_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_div_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_div_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_div_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_div_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd 
(), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) { + return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P, + (__v2df) __W, + (__mmask8) + __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { + return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) + __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) { + return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P, + (__v4df) __W, + (__mmask8) + __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { + return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { + return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, + void const *__P) { + return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { + return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) { + return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { + return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) + __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) { + return (__m256) 
__builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { + return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { + return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { + return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, + void const *__P) { + return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { + return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) { + return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) { + return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_getexp_pd (__m128d __A) { + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + 
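+/* Illustrative note (an assumption added here, not upstream LLVM documentation):
+ * the masked variants in this header share one pattern. The plain operation is
+ * evaluated first, then a __builtin_ia32_select{ps,pd,d,q}_{128,256} builtin
+ * picks, per mask bit, either that result or a fallback: the pass-through
+ * operand for _mm*_mask_*, zero for _mm*_maskz_*, and the third operand for
+ * _mm*_mask3_*. A minimal sketch of what the surrounding getexp intrinsics
+ * compute, assuming a hypothetical caller:
+ *
+ *   __m128d x = _mm_set_pd(8.0, 2.0);          // lane 1 = 8.0, lane 0 = 2.0
+ *   __m128d r = _mm_maskz_getexp_pd(0x1, x);   // lane 0 = floor(log2(2.0)) = 1.0,
+ *                                              // lane 1 zeroed (mask bit clear)
+ */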
+static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) { + return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_getexp_pd (__m256d __A) { + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) { + return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_getexp_ps (__m128 __A) { + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) { + return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_getexp_ps (__m256 __A) { + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { + return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_max_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_max_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_max_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_max_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_max_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_max_ps(__A, __B), + 
(__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_max_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_max_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_min_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_min_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_min_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_min_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_min_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_min_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_min_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_min_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_mul_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_mul_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_mul_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_mul_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + 
(__v4sf)_mm_mul_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_mul_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_mul_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_mul_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_abs_epi32(__A), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_abs_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_abs_epi32(__A), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_abs_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_abs_epi64 (__m128i __A) { +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pabsq128((__v2di)__A); +#else + return (__m128i)__builtin_elementwise_abs((__v2di)__A); +#endif +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_abs_epi64(__A), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_abs_epi64(__A), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_abs_epi64 (__m256i __A) { +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pabsq256 ((__v4di)__A); +#else + return (__m256i)__builtin_elementwise_abs((__v4di)__A); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_abs_epi64(__A), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_abs_epi64(__A), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epi32(__A, __B), + (__v4si)__W); 
+} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_max_epi64 (__m128i __A, __m128i __B) { +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B); +#else + return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B); +#endif +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epi64 (__m256i __A, __m256i __B) { +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B); +#else + return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epu32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_max_epu32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epu32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_max_epu32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_max_epu64 (__m128i __A, __m128i __B) { +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B); +#else + return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B); +#endif +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epu64(__A, __B), + 
(__v2di)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_max_epu64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_max_epu64 (__m256i __A, __m256i __B) { +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B); +#else + return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epu64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_max_epu64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_min_epi64 (__m128i __A, __m128i __B) { +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B); +#else + return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B); +#endif +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epi64 (__m256i __A, __m256i __B) { +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B); +#else + return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epi64(__A, 
__B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epu32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm_min_epu32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epu32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_min_epu32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_min_epu64 (__m128i __A, __m128i __B) { +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B); +#else + return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B); +#endif +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epu64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, + (__v2di)_mm_min_epu64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_min_epu64 (__m256i __A, __m256i __B) { +#if (__clang_major__ < 14) + return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B); +#else + return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B); +#endif +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epu64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_min_epu64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +#define _mm_roundscale_pd(A, imm) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) + + +#define _mm_mask_roundscale_pd(W, U, A, imm) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + + +#define _mm_maskz_roundscale_pd(U, A, imm) \ + ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ + (int)(imm), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + + +#define _mm256_roundscale_pd(A, imm) \ + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) + + +#define _mm256_mask_roundscale_pd(W, U, A, imm) \ + ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + + +#define _mm256_maskz_roundscale_pd(U, A, imm) \ + 
((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ + (int)(imm), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_roundscale_ps(A, imm) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) + + +#define _mm_mask_roundscale_ps(W, U, A, imm) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + + +#define _mm_maskz_roundscale_ps(U, A, imm) \ + ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_roundscale_ps(A, imm) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_roundscale_ps(W, U, A, imm) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + + +#define _mm256_maskz_roundscale_ps(U, A, imm) \ + ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_scalef_pd (__m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, + __m128d __B) { + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, + (__v2df) __B, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_scalef_pd (__m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, + __m256d __B) { + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, + (__v4df) __B, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_scalef_ps (__m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, + (__v4sf) __B, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_scalef_ps (__m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) 
__B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, + __m256 __B) { + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, + (__v8sf) __B, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +#define _mm_i64scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \ + (__v2di)(__m128i)(index), \ + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \ + (__v2di)(__m128i)(index), \ + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_i64scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \ + (__v2di)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \ + (__v2di)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm256_i64scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \ + (__v4di)(__m256i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \ + (__v4di)(__m256i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_i64scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \ + (__v4di)(__m256i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \ + (__v4di)(__m256i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm_i64scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \ + (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \ + (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_i64scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \ + (__v2di)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \ + (__v2di)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm256_i64scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \ + (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \ + (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm256_i64scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \ + (__v4di)(__m256i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm256_mask_i64scatter_epi32(addr, 
mask, index, v1, scale) \ + __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \ + (__v4di)(__m256i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm_i32scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v2df)(__m128d)(v1), (int)(scale)) + +#define _mm_i32scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v2di)(__m128i)(v1), (int)(scale)) + +#define _mm256_i32scatter_pd(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4df)(__m256d)(v1), (int)(scale)) + +#define _mm256_i32scatter_epi64(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4di)(__m256i)(v1), (int)(scale)) + +#define _mm_i32scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ + (int)(scale)) + +#define _mm_i32scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \ + (__v4si)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \ + (__v4si)(__m128i)(index), \ + (__v4si)(__m128i)(v1), (int)(scale)) + +#define _mm256_i32scatter_ps(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ + (int)(scale)) + +#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ + (int)(scale)) + +#define _mm256_i32scatter_epi32(addr, index, v1, scale) \ + __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \ + (__v8si)(__m256i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + +#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ + __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \ + (__v8si)(__m256i)(index), \ + (__v8si)(__m256i)(v1), (int)(scale)) + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)__W); + } + + static 
__inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sqrt_pd(__A), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)__W); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sqrt_pd(__A), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)__W); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sqrt_ps(__A), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)__W); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sqrt_ps(__A), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)__W); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_sub_pd(__A, __B), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)__W); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_sub_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)__W); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_sub_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)__W); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_sub_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + 
_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, + (__v4si)__B); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)__A); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)__I); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_permutex2var_epi32(__A, __I, __B), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { + return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, + (__v8si) __B); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)__A); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)__I); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, + (__v2df)__B); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)__A); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)(__m128d)__I); + } + + static __inline__ __m128d __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { + return (__m128d)__builtin_ia32_selectpd_128(__U, + (__v2df)_mm_permutex2var_pd(__A, __I, __B), + (__v2df)_mm_setzero_pd()); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { + return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, + (__v4df)__B); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)__A); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, 
__mmask8 __U, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)(__m256d)__I); + } + + static __inline__ __m256d __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, + __m256d __B) { + return (__m256d)__builtin_ia32_selectpd_256(__U, + (__v4df)_mm256_permutex2var_pd(__A, __I, __B), + (__v4df)_mm256_setzero_pd()); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, + (__v4sf)__B); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)__A); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)(__m128)__I); + } + + static __inline__ __m128 __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { + return (__m128)__builtin_ia32_selectps_128(__U, + (__v4sf)_mm_permutex2var_ps(__A, __I, __B), + (__v4sf)_mm_setzero_ps()); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, + (__v8sf) __B); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)__A); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, + __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)(__m256)__I); + } + + static __inline__ __m256 __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, + __m256 __B) { + return (__m256)__builtin_ia32_selectps_256(__U, + (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), + (__v8sf)_mm256_setzero_ps()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { + return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, + (__v2di)__B); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)__A); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)__I); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, + __m128i __B) { + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_permutex2var_epi64(__A, __I, __B), + (__v2di)_mm_setzero_si128()); + } + + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { + return 
(__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, + (__v4di) __B); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)__A); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)__I); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, + __m256i __B) { + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi8_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi8_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi8_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi8_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi32_epi64(__X), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)__W); + } + + 
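/*
 * Illustrative usage sketch (editorial, not part of the LLVM header or of
 * this patch): the wrappers above all follow one convention -- the
 * _mm*_mask_* forms merge the computed lanes into __W under the bit-mask
 * __U, while the _mm*_maskz_* forms zero the lanes that __U deselects.
 * The helper name and the 0x5 mask below are arbitrary examples; they
 * assume a compiler and CPU with AVX512F and AVX512VL enabled.
 *
 *   #include <immintrin.h>
 *
 *   __m256i widen_low_dwords(__m128i x, __m256i fallback)
 *   {
 *       // Sign-extend the four 32-bit lanes of x to 64 bits, keeping only
 *       // lanes 0 and 2 (mask 0x5); lanes 1 and 3 are taken from fallback.
 *       return _mm256_mask_cvtepi32_epi64(fallback, (__mmask8)0x5, x);
 *   }
 */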
static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi32_epi64(__X), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepi16_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepi16_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepi16_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepi16_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu8_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu8_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) + { + return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu8_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu8_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu32_epi64(__X), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu32_epi64(__X), + (__v4di)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_cvtepu16_epi32(__A), + (__v4si)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_cvtepu16_epi32(__A), + (__v8si)_mm256_setzero_si256()); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)__W); + } + + static __inline__ __m128i __DEFAULT_FN_ATTRS128 + _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) + { + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_cvtepu16_epi64(__A), + (__v2di)_mm_setzero_si128()); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)__W); + } + + static __inline__ __m256i __DEFAULT_FN_ATTRS256 + _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) + { + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_cvtepu16_epi64(__A), + (__v4di)_mm256_setzero_si256()); + } + + +#define _mm_rol_epi32(a, 
b) \ + ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b))) + +#define _mm_mask_rol_epi32(w, u, a, b) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) + +#define _mm_maskz_rol_epi32(u, a, b) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_rol_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) + +#define _mm256_rol_epi32(a, b) \ + ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) + +#define _mm256_mask_rol_epi32(w, u, a, b) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) + +#define _mm256_maskz_rol_epi32(u, a, b) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_rol_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_rol_epi64(a, b) \ + ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) + +#define _mm_mask_rol_epi64(w, u, a, b) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) + +#define _mm_maskz_rol_epi64(u, a, b) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_rol_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) + +#define _mm256_rol_epi64(a, b) \ + ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) + +#define _mm256_mask_rol_epi64(w, u, a, b) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) + +#define _mm256_maskz_rol_epi64(u, a, b) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_rol_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256())) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_rolv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rolv_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_rolv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rolv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_rolv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ 
+ return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rolv_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_rolv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rolv_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +#define _mm_ror_epi32(a, b) \ + ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))) + +#define _mm_mask_ror_epi32(w, u, a, b) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)(__m128i)(w))) + +#define _mm_maskz_ror_epi32(u, a, b) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ + (__v4si)_mm_ror_epi32((a), (b)), \ + (__v4si)_mm_setzero_si128())) + +#define _mm256_ror_epi32(a, b) \ + ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))) + +#define _mm256_mask_ror_epi32(w, u, a, b) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)(__m256i)(w))) + +#define _mm256_maskz_ror_epi32(u, a, b) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ + (__v8si)_mm256_ror_epi32((a), (b)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_ror_epi64(a, b) \ + ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))) + +#define _mm_mask_ror_epi64(w, u, a, b) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)(__m128i)(w))) + +#define _mm_maskz_ror_epi64(u, a, b) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ + (__v2di)_mm_ror_epi64((a), (b)), \ + (__v2di)_mm_setzero_si128())) + +#define _mm256_ror_epi64(a, b) \ + ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))) + +#define _mm256_mask_ror_epi64(w, u, a, b) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)(__m256i)(w))) + +#define _mm256_maskz_ror_epi64(u, a, b) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ + (__v4di)_mm256_ror_epi64((a), (b)), \ + (__v4di)_mm256_setzero_si256())) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sll_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sll_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sll_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ 
__m128i __DEFAULT_FN_ATTRS128 +_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_slli_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_slli_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_slli_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_slli_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sll_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sll_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sll_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_slli_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_slli_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_slli_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_slli_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_rorv_epi32 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return 
(__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_rorv_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_rorv_epi32 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_rorv_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_rorv_epi64 (__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_rorv_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_rorv_epi64 (__m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rorv_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_rorv_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sllv_epi64(__X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_sllv_epi64(__X, __Y), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sllv_epi64(__X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_sllv_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sllv_epi32(__X, __Y), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sllv_epi32(__X, __Y), + 
(__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sllv_epi32(__X, __Y), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sllv_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srlv_epi64(__X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srlv_epi64(__X, __Y), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srlv_epi64(__X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srlv_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srlv_epi32(__X, __Y), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srlv_epi32(__X, __Y), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srlv_epi32(__X, __Y), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srlv_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srl_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srl_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srl_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srl_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 +_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srli_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srli_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srli_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srli_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srl_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srl_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srl_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srl_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srli_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srli_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srli_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srli_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srav_epi32(__X, __Y), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srav_epi32(__X, __Y), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srav_epi32(__X, __Y), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srav_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srav_epi64(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srav_epi64(__X, __Y), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_srav_epi64(__X, __Y), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srav_epi64(__m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srav_epi64(__X, __Y), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_srav_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, + (__v4si) __A, + (__v4si) __W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, + (__v4si) __A, + (__v4si) _mm_setzero_si128 ()); +} + + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, + (__v8si) __A, + (__v8si) __W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, + (__v8si) __A, + (__v8si) _mm256_setzero_si256 ()); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_load_epi32 (void const *__P) +{ + return *(const __m128i *) __P; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P, + (__v4si) __W, + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_load_epi32 (void const *__P) +{ + return *(const __m256i *) __P; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) 
__builtin_ia32_movdqa32load256_mask ((const __v8si *) __P, + (__v8si) __W, + (__mmask8) + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_store_epi32 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_store_epi32 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, + (__v2di) __A, + (__v2di) __W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) +{ + return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, + (__v2di) __A, + (__v2di) _mm_setzero_si128 ()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, + (__v4di) __A, + (__v4di) __W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) +{ + return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, + (__v4di) __A, + (__v4di) _mm256_setzero_si256 ()); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_load_epi64 (void const *__P) +{ + return *(const __m128i *) __P; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P, + (__v2di) __W, + (__mmask8) + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) + __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_load_epi64 (void const *__P) +{ + return *(const __m256i *) __P; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P, + (__v4di) __W, + (__mmask8) + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) + __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_store_epi64 (void *__P, __m128i __A) +{ + *(__m128i *) __P = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_store_epi64 (void *__P, __m256i __A) +{ + *(__m256i *) __P = __A; +} 
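
For reference, a minimal usage sketch of the masked 256-bit aligned load/store intrinsics defined above (illustrative only, not part of the patch; the buffer contents and mask value are made up, and it assumes building with clang and -mavx512f -mavx512vl):

/* Illustrative sketch: merge-load eight ints under a mask, then write back
 * only the complementary lanes. Requires AVX-512F + AVX-512VL. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    _Alignas(32) int buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __m256i fallback = _mm256_set1_epi32(-1);
    __mmask8 k = 0xF0;                      /* select the upper four lanes */

    /* Lanes where k is 1 come from buf; lanes where k is 0 come from fallback. */
    __m256i v = _mm256_mask_load_epi32(fallback, k, buf);

    /* Store only the lower four lanes of v (the fallback values);
     * the upper half of buf is left untouched. */
    _mm256_mask_store_epi32(buf, (__mmask8)~k, v);

    for (int i = 0; i < 8; i++)
        printf("%d ", buf[i]);             /* prints: -1 -1 -1 -1 5 6 7 8 */
    printf("\n");
    return 0;
}
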
+ +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_movedup_pd(__A), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_movedup_pd(__A), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_movedup_pd(__A), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_movedup_pd(__A), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)__O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_set1_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)__O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_set1_epi32( __mmask8 __M, int __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_set1_epi32(__A), + (__v8si)_mm256_setzero_si256()); +} + + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) +{ + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m128i) __builtin_ia32_selectq_128(__M, + (__v2di) _mm_set1_epi64x(__A), + (__v2di) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) __O) ; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) +{ + return (__m256i) __builtin_ia32_selectq_256(__M, + (__v4di) _mm256_set1_epi64x(__A), + (__v4di) _mm256_setzero_si256()); +} + +#define _mm_fixupimm_pd(A, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \ + ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ + (__v2df)(__m128d)(B), \ + (__v2di)(__m128i)(C), \ + (int)(imm), (__mmask8)(U))) + +#define 
_mm256_fixupimm_pd(A, B, C, imm) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \ + ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \ + ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), \ + (__v4di)(__m256i)(C), \ + (int)(imm), (__mmask8)(U))) + +#define _mm_fixupimm_ps(A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ + (__v4sf)(__m128)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_fixupimm_ps(A, B, C, imm) \ + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \ + ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ + ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_load_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_load_ps (__mmask8 __U, void const *__P) +{ + return (__m256) 
__builtin_ia32_loadaps256_mask ((const __v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi64*)__P)->__v; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P, + (__v2di) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P, + (__v2di) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi64 (void const *__P) +{ + struct __loadu_epi64 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi64*)__P)->__v; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P, + (__v4di) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P, + (__v4di) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline __m128i __DEFAULT_FN_ATTRS128 +_mm_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi32*)__P)->__v; +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P, + (__v4si) __W, + (__mmask8) __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P, + (__v4si) + _mm_setzero_si128 (), + (__mmask8) __U); +} + +static __inline __m256i __DEFAULT_FN_ATTRS256 +_mm256_loadu_epi32 (void const *__P) +{ + struct __loadu_epi32 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_epi32*)__P)->__v; +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P, + (__v8si) __W, + (__mmask8) __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P, + (__v8si) + _mm256_setzero_si256 (), + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) +{ + return 
(__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) +{ + return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) +{ + return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeapd128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeapd256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeaps128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeaps256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi64 (void *__P, __m128i __A) +{ + struct __storeu_epi64 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqudi128_mask ((__v2di *) __P, + (__v2di) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi64 (void *__P, __m256i __A) +{ + struct __storeu_epi64 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi64*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqudi256_mask ((__v4di *) __P, + (__v4di) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS128 +_mm_storeu_epi32 (void *__P, __m128i __A) +{ + struct __storeu_epi32 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_epi32*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) +{ + __builtin_ia32_storedqusi128_mask ((__v4si *) __P, + (__v4si) __A, + (__mmask8) __U); +} + +static __inline void __DEFAULT_FN_ATTRS256 +_mm256_storeu_epi32 (void *__P, __m256i __A) +{ + struct __storeu_epi32 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); 
+ ((struct __storeu_epi32*)__P)->__v = __A; +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) +{ + __builtin_ia32_storedqusi256_mask ((__v8si *) __P, + (__v8si) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) +{ + __builtin_ia32_storeupd128_mask ((__v2df *) __P, + (__v2df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) +{ + __builtin_ia32_storeupd256_mask ((__v4df *) __P, + (__v4df) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) +{ + __builtin_ia32_storeups128_mask ((__v4sf *) __P, + (__v4sf) __A, + (__mmask8) __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) +{ + __builtin_ia32_storeups256_mask ((__v8sf *) __P, + (__v8sf) __A, + (__mmask8) __U); +} + + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_unpackhi_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_unpackhi_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_unpackhi_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_unpackhi_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_unpackhi_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_unpackhi_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_unpackhi_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_unpackhi_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_unpacklo_pd(__A, __B), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_unpacklo_pd(__A, __B), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ 
__m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_unpacklo_pd(__A, __B), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_unpacklo_pd(__A, __B), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_unpacklo_ps(__A, __B), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_unpacklo_ps(__A, __B), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_unpacklo_ps(__A, __B), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_unpacklo_ps(__A, __B), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_rcp14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_rcp14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_rcp14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_rcp14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + 
_mm256_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +#define _mm_mask_permute_pd(W, U, X, C) \ + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)(__m128d)(W))) + +#define _mm_maskz_permute_pd(U, X, C) \ + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_permute_pd((X), (C)), \ + (__v2df)_mm_setzero_pd())) + +#define _mm256_mask_permute_pd(W, U, X, C) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_permute_pd(U, X, C) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permute_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm_mask_permute_ps(W, U, X, C) \ + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)(__m128)(W))) + +#define _mm_maskz_permute_ps(U, X, C) \ + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_permute_ps((X), (C)), \ + (__v4sf)_mm_setzero_ps())) + +#define _mm256_mask_permute_ps(W, U, X, C) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_permute_ps(U, X, C) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_permute_ps((X), (C)), \ + (__v8sf)_mm256_setzero_ps())) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_permutevar_pd(__A, __C), + (__v2df)__W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) +{ + return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, + (__v2df)_mm_permutevar_pd(__A, __C), + (__v2df)_mm_setzero_pd()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutevar_pd(__A, __C), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutevar_pd(__A, __C), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_permutevar_ps(__A, __C), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_permutevar_ps(__A, __C), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + 
(__v8sf)_mm256_permutevar_ps(__A, __C), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutevar_ps(__A, __C), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_test_epi32_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_test_epi32_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_test_epi64_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_test_epi64_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_testn_epi32_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_testn_epi32_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_testn_epi64_mask (__m128i __A, __m128i __B) +{ + return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 +_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B), + _mm_setzero_si128()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_testn_epi64_mask (__m256i __A, __m256i __B) +{ + return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 +_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) +{ + return 
_mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B), + _mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_unpackhi_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_unpackhi_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_unpackhi_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_unpackhi_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_unpackhi_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_unpackhi_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_unpackhi_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_unpackhi_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_unpacklo_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_unpacklo_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_unpacklo_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_unpacklo_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_unpacklo_epi64(__A, __B), + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, + (__v2di)_mm_unpacklo_epi64(__A, __B), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_unpacklo_epi64(__A, __B), + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, + (__v4di)_mm256_unpacklo_epi64(__A, __B), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sra_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_sra_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sra_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_sra_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srai_epi32(__A, __B), + (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, + (__v4si)_mm_srai_epi32(__A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srai_epi32(__A, __B), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, + (__v8si)_mm256_srai_epi32(__A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_sra_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_sra_epi64(__A, __B), \ + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_sra_epi64(__A, __B), \ + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_sra_epi64(__m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_sra_epi64(__A, __B), \ + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_sra_epi64(__A, __B), \ + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_srai_epi64(__m128i __A, unsigned int __imm) +{ + return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_srai_epi64(__A, __imm), \ + (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm) +{ + return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ + (__v2di)_mm_srai_epi64(__A, __imm), \ + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_srai_epi64(__m256i __A, unsigned int __imm) +{ + return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, + unsigned int __imm) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_srai_epi64(__A, __imm), \ + (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ + (__v4di)_mm256_srai_epi64(__A, __imm), \ + (__v4di)_mm256_setzero_si256()); +} + +#define _mm_ternarylogic_epi32(A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), \ + (__v4si)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_ternarylogic_epi32(A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), \ + (__v8si)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_ternarylogic_epi64(A, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm_maskz_ternarylogic_epi64(U, A, 
B, C, imm) \ + ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), \ + (__v2di)(__m128i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_ternarylogic_epi64(A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)-1)) + +#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + +#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ + ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (__v4di)(__m256i)(C), (int)(imm), \ + (__mmask8)(U))) + + + +#define _mm256_shuffle_f32x4(A, B, imm) \ + ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ + (__v8sf)(__m256)(B), (int)(imm))) + +#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_shuffle_f64x2(A, B, imm) \ + ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ + (__v4df)(__m256d)(B), (int)(imm))) + +#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_shuffle_i32x4(A, B, imm) \ + ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) + +#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)(__m256i)(W))) + +#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm256_shuffle_i64x2(A, B, imm) \ + ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) + +#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) + + +#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) + +#define _mm_mask_shuffle_pd(W, U, A, B, M) \ + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)(__m128d)(W))) + +#define _mm_maskz_shuffle_pd(U, A, B, M) \ + ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ + (__v2df)_mm_shuffle_pd((A), (B), (M)), \ + (__v2df)_mm_setzero_pd())) + +#define _mm256_mask_shuffle_pd(W, U, A, B, M) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_shuffle_pd(U, A, B, M) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + 
(__v4df)_mm256_shuffle_pd((A), (B), (M)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm_mask_shuffle_ps(W, U, A, B, M) \ + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)(__m128)(W))) + +#define _mm_maskz_shuffle_ps(U, A, B, M) \ + ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ + (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ + (__v4sf)_mm_setzero_ps())) + +#define _mm256_mask_shuffle_ps(W, U, A, B, M) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_shuffle_ps(U, A, B, M) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ + (__v8sf)_mm256_setzero_ps())) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_rsqrt14_pd (__m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) __W, + (__mmask8) __U); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, + (__v2df) + _mm_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_rsqrt14_pd (__m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) -1); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) __W, + (__mmask8) __U); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, + (__v4df) + _mm256_setzero_pd (), + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_rsqrt14_ps (__m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_rsqrt14_ps (__m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) -1); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_broadcast_f32x4(__m128 __A) +{ + return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, + 0, 1, 
2, 3, 0, 1, 2, 3); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)__O); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, + (__v8sf)_mm256_broadcast_f32x4(__A), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_broadcast_i32x4(__m128i __A) +{ + return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)__O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_broadcast_i32x4(__A), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256(__M, + (__v4df) _mm256_broadcastsd_pd(__A), + (__v4df) __O); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) +{ + return (__m256d)__builtin_ia32_selectpd_256(__M, + (__v4df) _mm256_broadcastsd_pd(__A), + (__v4df) _mm256_setzero_pd()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128(__M, + (__v4sf) _mm_broadcastss_ps(__A), + (__v4sf) __O); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128(__M, + (__v4sf) _mm_broadcastss_ps(__A), + (__v4sf) _mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256(__M, + (__v8sf) _mm256_broadcastss_ps(__A), + (__v8sf) __O); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) +{ + return (__m256)__builtin_ia32_selectps_256(__M, + (__v8sf) _mm256_broadcastss_ps(__A), + (__v8sf) _mm256_setzero_ps()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_broadcastd_epi32(__A), + (__v4si) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectd_128(__M, + (__v4si) _mm_broadcastd_epi32(__A), + (__v4si) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_broadcastd_epi32(__A), + (__v8si) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectd_256(__M, + (__v8si) _mm256_broadcastd_epi32(__A), + (__v8si) _mm256_setzero_si256()); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 +_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di) _mm_broadcastq_epi64(__A), + (__v2di) __O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m128i)__builtin_ia32_selectq_128(__M, + (__v2di) _mm_broadcastq_epi64(__A), + (__v2di) _mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di) _mm256_broadcastq_epi64(__A), + (__v4di) __O); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) +{ + return (__m256i)__builtin_ia32_selectq_256(__M, + (__v4di) _mm256_broadcastq_epi64(__A), + (__v4di) _mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi32_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi32_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi32_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi)_mm_setzero_si128 (), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi)__O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi32_epi16 (__m256i __A) +{ + return (__m128i) 
__builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_epi32 
(__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si)__O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtsepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtsepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi32_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi32_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi32_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi32_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) _mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi64_epi8 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi64_epi8 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, + (__v16qi) __O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) 
__A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi64_epi32 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi64_epi32 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtusepi64_epi16 (__m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtusepi64_epi16 (__m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi)_mm_undefined_si128(), + (__mmask8) -1); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, 
__mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi32_epi8 (__m128i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, + (__v16qi) + _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_epi8 (__m256i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v8si)__A, __v8qi), + (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, + 12, 13, 14, 15); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi32_epi16 (__m128i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi32_epi16 (__m256i __A) +{ + return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); +} + +static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi64_epi8 (__m128i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_epi8 (__m256i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, + (__v16qi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi64_epi32 (__m128i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, + (__v4si) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_epi32 (__m256i __A) +{ + return (__m128i)__builtin_convertvector((__v4di)__A, __v4si); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm256_cvtepi64_epi32(__A), + (__v4si)__O); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) +{ + return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, + (__v4si)_mm256_cvtepi64_epi32(__A), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_cvtepi64_epi16 (__m128i __A) +{ + return 
(__m128i)__builtin_shufflevector( + __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3, + 3, 3, 3, 3); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi)__O, + __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) +{ + return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) +{ + __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_cvtepi64_epi16 (__m256i __A) +{ + return (__m128i)__builtin_shufflevector( + __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, + 2, 3, 4, 5, 6, 7); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) __O, __M); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) +{ + return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, + (__v8hi) _mm_setzero_si128 (), + __M); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) +{ + __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); +} + +#define _mm256_extractf32x4_ps(A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_undefined_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_extractf32x4_ps(U, A, imm) \ + ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ + (int)(imm), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_extracti32x4_epi32(A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_undefined_si128(), \ + (__mmask8)-1)) + +#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \ + ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ + (int)(imm), \ + (__v4si)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm256_insertf32x4(A, B, imm) \ + ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ + (__v4sf)(__m128)(B), (int)(imm))) + +#define _mm256_mask_insertf32x4(W, U, A, B, imm) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ + (__v8sf)(__m256)(W))) + +#define _mm256_maskz_insertf32x4(U, A, B, imm) \ + ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ + (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ + (__v8sf)_mm256_setzero_ps())) + +#define _mm256_inserti32x4(A, B, imm) \ + ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) + +#define _mm256_mask_inserti32x4(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_inserti32x4((A), 
(B), (imm)), \ + (__v8si)(__m256i)(W))) + +#define _mm256_maskz_inserti32x4(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_getmant_pd(A, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_pd(W, U, A, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)(__m128d)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_pd(U, A, B, C) \ + ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v2df)_mm_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm256_getmant_pd(A, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)-1)) + +#define _mm256_mask_getmant_pd(W, U, A, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)(__m256d)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_pd(U, A, B, C) \ + ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4df)_mm256_setzero_pd(), \ + (__mmask8)(U))) + +#define _mm_getmant_ps(A, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm_mask_getmant_ps(W, U, A, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)(__m128)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_getmant_ps(U, A, B, C) \ + ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ + (int)(((C)<<2) | (B)), \ + (__v4sf)_mm_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm256_getmant_ps(A, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)-1)) + +#define _mm256_mask_getmant_ps(W, U, A, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)(__m256)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_getmant_ps(U, A, B, C) \ + ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ + (int)(((C)<<2) | (B)), \ + (__v8sf)_mm256_setzero_ps(), \ + (__mmask8)(U))) + +#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ + ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ + ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ + ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ 
+ (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v2di)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ + ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4di)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ + ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ + ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ + ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ + (void const *)(addr), \ + (__v4si)(__m128i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ + ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ + ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ + (void const *)(addr), \ + (__v8si)(__m256i)(index), \ + (__mmask8)(mask), (int)(scale))) + +#define _mm256_permutex_pd(X, C) \ + ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C))) + +#define _mm256_mask_permutex_pd(W, U, X, C) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permutex_pd((X), (C)), \ + (__v4df)(__m256d)(W))) + +#define _mm256_maskz_permutex_pd(U, X, C) \ + ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ + (__v4df)_mm256_permutex_pd((X), (C)), \ + (__v4df)_mm256_setzero_pd())) + +#define _mm256_permutex_epi64(X, C) \ + ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C))) + +#define _mm256_mask_permutex_epi64(W, U, X, C) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_permutex_epi64((X), (C)), \ + (__v4di)(__m256i)(W))) + +#define _mm256_maskz_permutex_epi64(U, X, C) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_permutex_epi64((X), 
(C)), \ + (__v4di)_mm256_setzero_si256())) + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_pd (__m256i __X, __m256d __Y) +{ + return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, + __m256d __Y) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutexvar_pd(__X, __Y), + (__v4df)__W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) +{ + return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, + (__v4df)_mm256_permutexvar_pd(__X, __Y), + (__v4df)_mm256_setzero_pd()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_permutexvar_epi64(__X, __Y), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, + (__v4di)_mm256_permutexvar_epi64(__X, __Y), + (__v4di)__W); +} + +#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutexvar_ps(__X, __Y), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_permutexvar_ps(__X, __Y), + (__v8sf)_mm256_setzero_ps()); +} + +#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A)) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, + __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_permutexvar_epi32(__X, __Y), + (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) +{ + return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, + (__v8si)_mm256_permutexvar_epi32(__X, __Y), + (__v8si)_mm256_setzero_si256()); +} + +#define _mm_alignr_epi32(A, B, imm) \ + ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(imm))) + +#define _mm_mask_alignr_epi32(W, U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ + (__v4si)(__m128i)(W))) + +#define _mm_maskz_alignr_epi32(U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ + (__v4si)_mm_setzero_si128())) + +#define _mm256_alignr_epi32(A, B, imm) \ + ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(imm))) + +#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ + (__v8si)(__m256i)(W))) + +#define _mm256_maskz_alignr_epi32(U, A, B, imm) 
\ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_alignr_epi64(A, B, imm) \ + ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(imm))) + +#define _mm_mask_alignr_epi64(W, U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ + (__v2di)(__m128i)(W))) + +#define _mm_maskz_alignr_epi64(U, A, B, imm) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ + (__v2di)_mm_setzero_si128())) + +#define _mm256_alignr_epi64(A, B, imm) \ + ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(imm))) + +#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ + (__v4di)(__m256i)(W))) + +#define _mm256_maskz_alignr_epi64(U, A, B, imm) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ + (__v4di)_mm256_setzero_si256())) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_movehdup_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_movehdup_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_movehdup_ps(__A), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_movehdup_ps(__A), + (__v8sf)_mm256_setzero_ps()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_moveldup_ps(__A), + (__v4sf)__W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) +{ + return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, + (__v4sf)_mm_moveldup_ps(__A), + (__v4sf)_mm_setzero_ps()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_moveldup_ps(__A), + (__v8sf)__W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) +{ + return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, + (__v8sf)_mm256_moveldup_ps(__A), + (__v8sf)_mm256_setzero_ps()); +} + +#define _mm256_mask_shuffle_epi32(W, U, A, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)(__m256i)(W))) + +#define _mm256_maskz_shuffle_epi32(U, A, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shuffle_epi32((A), (I)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_mask_shuffle_epi32(W, U, A, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)(__m128i)(W))) + +#define 
_mm_maskz_shuffle_epi32(U, A, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shuffle_epi32((A), (I)), \ + (__v4si)_mm_setzero_si128())) + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, + (__v2df) __A, + (__v2df) __W); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) +{ + return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, + (__v2df) __A, + (__v2df) _mm_setzero_pd ()); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, + (__v4df) __A, + (__v4df) __W); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) +{ + return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, + (__v4df) __A, + (__v4df) _mm256_setzero_pd ()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, + (__v4sf) __A, + (__v4sf) __W); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) +{ + return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, + (__v4sf) __A, + (__v4sf) _mm_setzero_ps ()); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, + (__v8sf) __A, + (__v8sf) __W); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) +{ + return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, + (__v8sf) __A, + (__v8sf) _mm256_setzero_ps ()); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) __W, + (__mmask8) __U); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, + (__v4sf) + _mm_setzero_ps (), + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) __W, + (__mmask8) __U); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) +{ + return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, + (__v8sf) + _mm256_setzero_ps (), + (__mmask8) __U); +} + +#define _mm_mask_cvt_roundps_ph(W, U, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm_maskz_cvt_roundps_ph(U, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) + +#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph +#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph + +#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)(__m128i)(W), \ + (__mmask8)(U))) + +#define _mm256_maskz_cvt_roundps_ph(U, A, I) \ + ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ + (__v8hi)_mm_setzero_si128(), \ + (__mmask8)(U))) + 
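The masked cvtph_ps/cvt_roundps_ph wrappers above pass the write-mask straight to the underlying builtin; lanes whose mask bit is clear take the passthrough (or zero) operand. A minimal usage sketch, illustrative only and not part of the patch, assuming a clang build with AVX-512VL enabled (e.g. -mavx512vl):

#include <immintrin.h>

/* Widen four packed half-precision values to float; lanes whose bit in
 * `keep` is clear come from `fallback` instead. Illustrative helper only. */
static inline __m128 widen_halves_masked(__m128i half4, __m128 fallback,
                                         __mmask8 keep)
{
    return _mm_mask_cvtph_ps(fallback, keep, half4);
}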
+#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph +#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph + + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __AVX512VLINTRIN_H */ diff --git a/include-llvm/avx512vlvbmi2intrin.h b/include-llvm/avx512vlvbmi2intrin.h new file mode 100644 index 0000000..fac1f23 --- /dev/null +++ b/include-llvm/avx512vlvbmi2intrin.h @@ -0,0 +1,689 @@ +/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLVBMI2INTRIN_H +#define __AVX512VLVBMI2INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, + (__v8hi) _mm_setzero_si128(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, + (__v16qi) _mm_setzero_si128(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) +{ + __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) +{ + __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, + (__v8hi) _mm_setzero_si128(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) +{ + return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, + (__v16qi) _mm_setzero_si128(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) 
+{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, + (__v8hi) _mm_setzero_si128(), + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) __S, + __U); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) +{ + return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, + (__v16qi) _mm_setzero_si128(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_si256(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_si256(), + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) +{ + __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, + __U); +} + +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) +{ + __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, + (__v16hi) _mm256_setzero_si256(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) +{ + return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, + (__v32qi) _mm256_setzero_si256(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, + (__v16hi) _mm256_setzero_si256(), + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 
+_mm256_mask_expandloadu_epi8(__m256i __S, __mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) __S, + __U); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) +{ + return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, + (__v32qi) _mm256_setzero_si256(), + __U); +} + +#define _mm256_shldi_epi64(A, B, I) \ + ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) + +#define _mm256_mask_shldi_epi64(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) + +#define _mm256_maskz_shldi_epi64(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) + +#define _mm_shldi_epi64(A, B, I) \ + ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) + +#define _mm_mask_shldi_epi64(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) + +#define _mm_maskz_shldi_epi64(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shldi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) + +#define _mm256_shldi_epi32(A, B, I) \ + ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) + +#define _mm256_mask_shldi_epi32(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) + +#define _mm256_maskz_shldi_epi32(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_shldi_epi32(A, B, I) \ + ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) + +#define _mm_mask_shldi_epi32(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) + +#define _mm_maskz_shldi_epi32(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shldi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) + +#define _mm256_shldi_epi16(A, B, I) \ + ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) + +#define _mm256_mask_shldi_epi16(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) + +#define _mm256_maskz_shldi_epi16(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_shldi_epi16(A, B, I) \ + ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I))) + +#define _mm_mask_shldi_epi16(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) + +#define _mm_maskz_shldi_epi16(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) + +#define _mm256_shrdi_epi64(A, B, I) \ + ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), (int)(I))) 
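The immediate-count funnel shifts defined above (the shldi/shrdi families) treat each lane of A as the high half and the matching lane of B as the low half of a double-width value and shift by the immediate; for shldi the result is roughly (A << I) | (B >> (width - I)) per lane. One common use is rotating lanes by funnel-shifting a vector with itself, sketched below (illustrative only, not part of the patch; assumes clang with -mavx512vl -mavx512vbmi2):

#include <immintrin.h>

/* Rotate every 64-bit lane of `v` left by 13 bits by funnel-shifting
 * the vector with itself. Illustrative helper only. */
static inline __m256i rotl13_epi64(__m256i v)
{
    return _mm256_shldi_epi64(v, v, 13);
}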
+ +#define _mm256_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)(__m256i)(S))) + +#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ + (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ + (__v4di)_mm256_setzero_si256())) + +#define _mm_shrdi_epi64(A, B, I) \ + ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (int)(I))) + +#define _mm_mask_shrdi_epi64(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)(__m128i)(S))) + +#define _mm_maskz_shrdi_epi64(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ + (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ + (__v2di)_mm_setzero_si128())) + +#define _mm256_shrdi_epi32(A, B, I) \ + ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ + (__v8si)(__m256i)(B), (int)(I))) + +#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)(__m256i)(S))) + +#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ + (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ + (__v8si)_mm256_setzero_si256())) + +#define _mm_shrdi_epi32(A, B, I) \ + ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (int)(I))) + +#define _mm_mask_shrdi_epi32(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)(__m128i)(S))) + +#define _mm_maskz_shrdi_epi32(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ + (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ + (__v4si)_mm_setzero_si128())) + +#define _mm256_shrdi_epi16(A, B, I) \ + ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ + (__v16hi)(__m256i)(B), (int)(I))) + +#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)(__m256i)(S))) + +#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ + ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ + (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ + (__v16hi)_mm256_setzero_si256())) + +#define _mm_shrdi_epi16(A, B, I) \ + ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (int)(I))) + +#define _mm_mask_shrdi_epi16(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)(__m128i)(S))) + +#define _mm_maskz_shrdi_epi16(U, A, B, I) \ + ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ + (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ + (__v8hi)_mm_setzero_si128())) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, + (__v4di)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shldv_epi64(__A, __B, __C), + (__v4di)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shldv_epi64(__A, __B, __C), + 
(__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, + (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shldv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shldv_epi32(__A, __B, __C), + (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shldv_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shldv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, + (__v16hi)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), + (__v16hi)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shldv_epi16(__A, __B, __C), + (__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shldv_epi16(__A, __B, __C), + (__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shldv_epi16(__A, __B, __C), + 
(__v8hi)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, + (__v4di)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), + (__v4di)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectq_256(__U, + (__v4di)_mm256_shrdv_epi64(__A, __B, __C), + (__v4di)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, + (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectq_128(__U, + (__v2di)_mm_shrdv_epi64(__A, __B, __C), + (__v2di)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, + (__v8si)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), + (__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_shrdv_epi32(__A, __B, __C), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, + (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_shrdv_epi32(__A, __B, __C), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, + (__v16hi)__C); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), + (__v16hi)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)__builtin_ia32_selectw_256(__U, + (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), + 
(__v16hi)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, + (__v8hi)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_selectw_128(__U, + (__v8hi)_mm_shrdv_epi16(__A, __B, __C), + (__v8hi)_mm_setzero_si128()); +} + + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlvnniintrin.h b/include-llvm/avx512vlvnniintrin.h new file mode 100644 index 0000000..0fb29af --- /dev/null +++ b/include-llvm/avx512vlvnniintrin.h @@ -0,0 +1,304 @@ +/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VLVNNIINTRIN_H +#define __AVX512VLVNNIINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. 
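/*
 * Illustrative usage sketch, not part of the upstream LLVM header: the
 * VBMI2 funnel-shift intrinsics defined above concatenate two source
 * elements and shift the combined value; with both sources equal they
 * double as per-lane rotates.  Assumes clang with -mavx512vbmi2 -mavx512vl
 * on a CPU providing those features.
 */
#include <immintrin.h>

/* Rotate each 32-bit lane of x left by the per-lane count in r
 * (shldv with identical halves is a variable rotate). */
static inline __m128i rotl32_lanes(__m128i x, __m128i r)
{
        return _mm_shldv_epi32(x, x, r);
}

/* For each 32-bit lane, return the low 32 bits of (hi:lo) >> 16,
 * i.e. a 32-bit field straddling the two source elements. */
static inline __m128i straddle16(__m128i lo, __m128i hi)
{
        return _mm_shrdi_epi32(lo, hi, 16);
}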
+/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpbusds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssd_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +#define _mm256_dpwssds_epi32(S, A, B) \ + ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with +/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a S using signed saturation, and store the packed 32-bit results in DST. 
+/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpbusds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssd_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with +/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) +/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) +/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +#define _mm_dpwssds_epi32(S, A, B) \ + ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), + (__v8si)__S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), + (__v8si)__S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), + (__v8si)__S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), + (__v8si)__S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_selectd_256(__U, + (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), + (__v8si)_mm256_setzero_si256()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), + (__v4si)__S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusd_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), + (__v4si)__S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpbusds_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 
+_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), + (__v4si)__S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssd_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), + (__v4si)__S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_selectd_128(__U, + (__v4si)_mm_dpwssds_epi32(__S, __A, __B), + (__v4si)_mm_setzero_si128()); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vlvp2intersectintrin.h b/include-llvm/avx512vlvp2intersectintrin.h new file mode 100644 index 0000000..3e0815e --- /dev/null +++ b/include-llvm/avx512vlvp2intersectintrin.h @@ -0,0 +1,121 @@ +/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VLVP2INTERSECT_H +#define _AVX512VLVP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(128))) + +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ + __min_vector_width__(256))) +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. 
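/*
 * Illustrative usage sketch, not part of the upstream LLVM header: the VNNI
 * dot-product intrinsics above accumulate four u8*s8 (or two s16*s16)
 * products into each 32-bit lane, the core step of many int8-quantized dot
 * products.  Assumes clang with -mavx512vnni -mavx512vl.
 */
#include <immintrin.h>

/* acc[i] += a[4i]*w[4i] + ... + a[4i+3]*w[4i+3] for eight 32-bit lanes,
 * where a holds unsigned and w signed 8-bit values. */
static inline __m256i dot_u8s8(__m256i acc, __m256i a_u8, __m256i w_s8)
{
        return _mm256_dpbusd_epi32(acc, a_u8, w_s8);
}

/* Same update, but only for the lanes selected by the write mask k. */
static inline __m256i dot_u8s8_masked(__m256i acc, __mmask8 k,
                                      __m256i a_u8, __m256i w_s8)
{
        return _mm256_mask_dpbusd_epi32(acc, k, a_u8, w_s8);
}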
+/// \param __b +/// A 256-bit vector of [8 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x i64]. +/// \param __b +/// A 256-bit vector of [4 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS256 +_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \param __b +/// A 128-bit vector of [4 x i32] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. +/// \param __b +/// A 128-bit vector of [2 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS128 +_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avx512vnniintrin.h b/include-llvm/avx512vnniintrin.h new file mode 100644 index 0000000..9935a11 --- /dev/null +++ b/include-llvm/avx512vnniintrin.h @@ -0,0 +1,115 @@ +/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVX512VNNIINTRIN_H +#define __AVX512VNNIINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512))) + + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), + (__v16si)__S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), + (__v16si)__S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), + (__v16si)__S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, + (__v16si)__B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), + (__v16si)__S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) +{ + return (__m512i)__builtin_ia32_selectd_512(__U, + (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), + (__v16si)_mm512_setzero_si512()); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512vp2intersectintrin.h b/include-llvm/avx512vp2intersectintrin.h new file mode 100644 index 0000000..5d3cb48 --- /dev/null +++ b/include-llvm/avx512vp2intersectintrin.h @@ -0,0 +1,77 @@ +/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software 
without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _AVX512VP2INTERSECT_H +#define _AVX512VP2INTERSECT_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \ + __min_vector_width__(512))) + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between dwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTD instruction. +/// +/// \param __a +/// A 512-bit vector of [16 x i32]. +/// \param __b +/// A 512-bit vector of [16 x i32] +/// \param __m0 +/// A pointer point to 16-bit mask +/// \param __m1 +/// A pointer point to 16-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) { + __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1); +} + +/// Store, in an even/odd pair of mask registers, the indicators of the +/// locations of value matches between quadwords in operands __a and __b. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VP2INTERSECTQ instruction. +/// +/// \param __a +/// A 512-bit vector of [8 x i64]. +/// \param __b +/// A 512-bit vector of [8 x i64] +/// \param __m0 +/// A pointer point to 8-bit mask +/// \param __m1 +/// A pointer point to 8-bit mask +static __inline__ void __DEFAULT_FN_ATTRS +_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) { + __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512vpopcntdqintrin.h b/include-llvm/avx512vpopcntdqintrin.h new file mode 100644 index 0000000..bb435e6 --- /dev/null +++ b/include-llvm/avx512vpopcntdqintrin.h @@ -0,0 +1,54 @@ +/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX512VPOPCNTDQINTRIN_H +#define __AVX512VPOPCNTDQINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512))) + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectq_512( + (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { + return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { + return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { + return (__m512i)__builtin_ia32_selectd_512( + (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS +_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { + return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/avx512vpopcntdqvlintrin.h b/include-llvm/avx512vpopcntdqvlintrin.h new file mode 100644 index 0000000..a3cb9b6 --- /dev/null +++ b/include-llvm/avx512vpopcntdqvlintrin.h @@ -0,0 +1,91 @@ +/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error \ + "Never use directly; include instead." +#endif + +#ifndef __AVX512VPOPCNTDQVLINTRIN_H +#define __AVX512VPOPCNTDQVLINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi64(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectq_128( + (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_popcnt_epi32(__m128i __A) { + return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { + return (__m128i)__builtin_ia32_selectd_128( + (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { + return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi64(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectq_256( + (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_popcnt_epi32(__m256i __A) { + return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { + return (__m256i)__builtin_ia32_selectd_256( + (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { + return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif diff --git a/include-llvm/avxintrin.h b/include-llvm/avxintrin.h new file mode 100644 index 0000000..2f2a159 --- /dev/null +++ b/include-llvm/avxintrin.h @@ -0,0 +1,5062 @@ +/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
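/*
 * Illustrative usage sketch, not part of the upstream LLVM headers: the
 * VPOPCNTDQ intrinsics above count set bits per 32/64-bit lane, e.g. for
 * bitmap population counts.  Assumes clang with -mavx2 -mavx512vpopcntdq
 * -mavx512vl, and n being a multiple of 4.
 */
#include <immintrin.h>
#include <stdint.h>
#include <stddef.h>

/* Total number of set bits in an array of 64-bit words. */
static inline uint64_t bitmap_popcount(const uint64_t *words, size_t n)
{
        __m256i acc = _mm256_setzero_si256();
        for (size_t i = 0; i < n; i += 4) {
                __m256i v = _mm256_loadu_si256((const __m256i *)&words[i]);
                acc = _mm256_add_epi64(acc, _mm256_popcnt_epi64(v));
        }
        uint64_t lane[4];
        _mm256_storeu_si256((__m256i *)lane, acc);
        return lane[0] + lane[1] + lane[2] + lane[3];
}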
+#endif + +#ifndef __AVXINTRIN_H +#define __AVXINTRIN_H + +typedef double __v4df __attribute__ ((__vector_size__ (32))); +typedef float __v8sf __attribute__ ((__vector_size__ (32))); +typedef long long __v4di __attribute__ ((__vector_size__ (32))); +typedef int __v8si __attribute__ ((__vector_size__ (32))); +typedef short __v16hi __attribute__ ((__vector_size__ (32))); +typedef char __v32qi __attribute__ ((__vector_size__ (32))); + +/* Unsigned types */ +typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); +typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); +typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); +typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v32qs __attribute__((__vector_size__(32))); + +typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); +typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); +typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); + +typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); +typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); +typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); + +#if (__clang_major__ > 15) +#ifdef __SSE2__ +/* Both _Float16 and __bf16 require SSE2 being enabled. */ +typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); +typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); + +typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32))); +typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); +#endif +#endif + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128))) + +/* Arithmetic */ +/// Adds two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \returns A 256-bit vector of [4 x double] containing the sums of both +/// operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_add_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4df)__a+(__v4df)__b); +} + +/// Adds two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \returns A 256-bit vector of [8 x float] containing the sums of both +/// operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_add_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8sf)__a+(__v8sf)__b); +} + +/// Subtracts two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBPD instruction. 
+/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the minuend. +/// \param __b +/// A 256-bit vector of [4 x double] containing the subtrahend. +/// \returns A 256-bit vector of [4 x double] containing the differences between +/// both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_sub_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4df)__a-(__v4df)__b); +} + +/// Subtracts two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the minuend. +/// \param __b +/// A 256-bit vector of [8 x float] containing the subtrahend. +/// \returns A 256-bit vector of [8 x float] containing the differences between +/// both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_sub_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8sf)__a-(__v8sf)__b); +} + +/// Adds the even-indexed values and subtracts the odd-indexed values of +/// two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSUBPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the left source operand. +/// \param __b +/// A 256-bit vector of [4 x double] containing the right source operand. +/// \returns A 256-bit vector of [4 x double] containing the alternating sums +/// and differences between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_addsub_pd(__m256d __a, __m256d __b) +{ + return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); +} + +/// Adds the even-indexed values and subtracts the odd-indexed values of +/// two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the left source operand. +/// \param __b +/// A 256-bit vector of [8 x float] containing the right source operand. +/// \returns A 256-bit vector of [8 x float] containing the alternating sums and +/// differences between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_addsub_ps(__m256 __a, __m256 __b) +{ + return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); +} + +/// Divides two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the dividend. +/// \param __b +/// A 256-bit vector of [4 x double] containing the divisor. +/// \returns A 256-bit vector of [4 x double] containing the quotients of both +/// operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_div_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4df)__a/(__v4df)__b); +} + +/// Divides two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the dividend. +/// \param __b +/// A 256-bit vector of [8 x float] containing the divisor. +/// \returns A 256-bit vector of [8 x float] containing the quotients of both +/// operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_div_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8sf)__a/(__v8sf)__b); +} + +/// Compares two 256-bit vectors of [4 x double] and returns the greater +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXPD instruction. 
+/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \returns A 256-bit vector of [4 x double] containing the maximum values +/// between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_max_pd(__m256d __a, __m256d __b) +{ + return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); +} + +/// Compares two 256-bit vectors of [8 x float] and returns the greater +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \returns A 256-bit vector of [8 x float] containing the maximum values +/// between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_max_ps(__m256 __a, __m256 __b) +{ + return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); +} + +/// Compares two 256-bit vectors of [4 x double] and returns the lesser +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \returns A 256-bit vector of [4 x double] containing the minimum values +/// between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_min_pd(__m256d __a, __m256d __b) +{ + return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); +} + +/// Compares two 256-bit vectors of [8 x float] and returns the lesser +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \returns A 256-bit vector of [8 x float] containing the minimum values +/// between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_min_ps(__m256 __a, __m256 __b) +{ + return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); +} + +/// Multiplies two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the operands. +/// \returns A 256-bit vector of [4 x double] containing the products of both +/// operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_mul_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4df)__a * (__v4df)__b); +} + +/// Multiplies two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the operands. +/// \returns A 256-bit vector of [8 x float] containing the products of both +/// operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_mul_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8sf)__a * (__v8sf)__b); +} + +/// Calculates the square roots of the values in a 256-bit vector of +/// [4 x double]. 
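/*
 * Illustrative usage sketch, not part of the upstream LLVM header: the
 * packed AVX arithmetic intrinsics above operate lane-wise on 256-bit
 * vectors, e.g. four doubles at a time.  Assumes clang with -mavx and n
 * being a multiple of 4.
 */
#include <immintrin.h>
#include <stddef.h>

/* y[i] = a * x[i] + y[i], processed four doubles per iteration. */
static inline void daxpy_avx(size_t n, double a, const double *x, double *y)
{
        const __m256d va = _mm256_set1_pd(a);
        for (size_t i = 0; i < n; i += 4) {
                __m256d vx = _mm256_loadu_pd(&x[i]);
                __m256d vy = _mm256_loadu_pd(&y[i]);
                _mm256_storeu_pd(&y[i], _mm256_add_pd(_mm256_mul_pd(va, vx), vy));
        }
}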
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 256-bit vector of [4 x double] containing the square roots of the +/// values in the operand. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_sqrt_pd(__m256d __a) +{ + return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); +} + +/// Calculates the square roots of the values in a 256-bit vector of +/// [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the square roots of the +/// values in the operand. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_sqrt_ps(__m256 __a) +{ + return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); +} + +/// Calculates the reciprocal square roots of the values in a 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRSQRTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the reciprocal square +/// roots of the values in the operand. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_rsqrt_ps(__m256 __a) +{ + return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); +} + +/// Calculates the reciprocals of the values in a 256-bit vector of +/// [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRCPPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the +/// values in the operand. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_rcp_ps(__m256 __a) +{ + return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); +} + +/// Rounds the values in a 256-bit vector of [4 x double] as specified +/// by the byte operand. The source values are rounded to integer values and +/// returned as 64-bit double-precision floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_round_pd(__m256d V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double]. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used. \n +/// 1: The PE field is not updated. \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M. \n +/// 1: Use the current MXCSR setting. \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest. \n +/// 01: Downward (toward negative infinity). \n +/// 10: Upward (toward positive infinity). \n +/// 11: Truncated. +/// \returns A 256-bit vector of [4 x double] containing the rounded values. +#define _mm256_round_pd(V, M) \ + ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))) + +/// Rounds the values stored in a 256-bit vector of [8 x float] as +/// specified by the byte operand. The source values are rounded to integer +/// values and returned as floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_round_ps(__m256 V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float]. +/// \param M +/// An integer value that specifies the rounding operation. 
\n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used. \n +/// 1: The PE field is not updated. \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M. \n +/// 1: Use the current MXCSR setting. \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest. \n +/// 01: Downward (toward negative infinity). \n +/// 10: Upward (toward positive infinity). \n +/// 11: Truncated. +/// \returns A 256-bit vector of [8 x float] containing the rounded values. +#define _mm256_round_ps(V, M) \ + ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))) + +/// Rounds up the values stored in a 256-bit vector of [4 x double]. The +/// source values are rounded up to integer values and returned as 64-bit +/// double-precision floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_ceil_pd(__m256d V); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double]. +/// \returns A 256-bit vector of [4 x double] containing the rounded up values. +#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) + +/// Rounds down the values stored in a 256-bit vector of [4 x double]. +/// The source values are rounded down to integer values and returned as +/// 64-bit double-precision floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_floor_pd(__m256d V); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double]. +/// \returns A 256-bit vector of [4 x double] containing the rounded down +/// values. +#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) + +/// Rounds up the values stored in a 256-bit vector of [8 x float]. The +/// source values are rounded up to integer values and returned as +/// floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_ceil_ps(__m256 V); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the rounded up values. +#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) + +/// Rounds down the values stored in a 256-bit vector of [8 x float]. The +/// source values are rounded down to integer values and returned as +/// floating-point values. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_floor_ps(__m256 V); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the rounded down values. +#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) + +/* Logical */ +/// Performs a bitwise AND of two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the +/// values between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_and_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4du)__a & (__v4du)__b); +} + +/// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 
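/*
 * Illustrative usage sketch, not part of the upstream LLVM header: the
 * rounding macros above wrap VROUNDPD/VROUNDPS with an immediate rounding
 * mode; the _mm256_ceil_ and _mm256_floor_ forms are fixed modes of the
 * same instruction.  Assumes clang with -mavx.
 */
#include <immintrin.h>

/* Round four doubles to the nearest integer value (ties to even) without
 * raising a precision exception. */
static inline __m256d round_nearest_pd(__m256d v)
{
        return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}

/* ceil(v) - floor(v): 0.0 for integral lanes, 1.0 otherwise. */
static inline __m256 frac_indicator_ps(__m256 v)
{
        return _mm256_sub_ps(_mm256_ceil_ps(v), _mm256_floor_ps(v));
}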
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the +/// values between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_and_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8su)__a & (__v8su)__b); +} + +/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using +/// the one's complement of the values contained in the first source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDNPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the left source operand. The +/// one's complement of this value is used in the bitwise AND. +/// \param __b +/// A 256-bit vector of [4 x double] containing the right source operand. +/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the +/// values of the second operand and the one's complement of the first +/// operand. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_andnot_pd(__m256d __a, __m256d __b) +{ + return (__m256d)(~(__v4du)__a & (__v4du)__b); +} + +/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using +/// the one's complement of the values contained in the first source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDNPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the left source operand. The +/// one's complement of this value is used in the bitwise AND. +/// \param __b +/// A 256-bit vector of [8 x float] containing the right source operand. +/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the +/// values of the second operand and the one's complement of the first +/// operand. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_andnot_ps(__m256 __a, __m256 __b) +{ + return (__m256)(~(__v8su)__a & (__v8su)__b); +} + +/// Performs a bitwise OR of two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VORPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the +/// values between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_or_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4du)__a | (__v4du)__b); +} + +/// Performs a bitwise OR of two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VORPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the +/// values between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_or_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8su)__a | (__v8su)__b); +} + +/// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPD instruction. 
+/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the +/// values between both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_xor_pd(__m256d __a, __m256d __b) +{ + return (__m256d)((__v4du)__a ^ (__v4du)__b); +} + +/// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the +/// values between both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_xor_ps(__m256 __a, __m256 __b) +{ + return (__m256)((__v8su)__a ^ (__v8su)__b); +} + +/* Horizontal arithmetic */ +/// Horizontally adds the adjacent pairs of values contained in two +/// 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHADDPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// The horizontal sums of the values are returned in the even-indexed +/// elements of a vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// The horizontal sums of the values are returned in the odd-indexed +/// elements of a vector of [4 x double]. +/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of +/// both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_hadd_pd(__m256d __a, __m256d __b) +{ + return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in two +/// 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHADDPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// The horizontal sums of the values are returned in the elements with +/// index 0, 1, 4, 5 of a vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// The horizontal sums of the values are returned in the elements with +/// index 2, 3, 6, 7 of a vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of +/// both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_hadd_ps(__m256 __a, __m256 __b) +{ + return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in two +/// 256-bit vectors of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHSUBPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// The horizontal differences between the values are returned in the +/// even-indexed elements of a vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double] containing one of the source operands. +/// The horizontal differences between the values are returned in the +/// odd-indexed elements of a vector of [4 x double]. 
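/*
 * Illustrative usage sketch, not part of the upstream LLVM header: the
 * bitwise AND/ANDN/OR/XOR intrinsics above are typically used to manipulate
 * the sign bit of packed floating-point lanes directly.  Assumes clang with
 * -mavx.
 */
#include <immintrin.h>

/* |v| for four doubles: clear the sign bit of every lane (~sign & v). */
static inline __m256d abs_pd(__m256d v)
{
        return _mm256_andnot_pd(_mm256_set1_pd(-0.0), v);
}

/* -v for eight floats: flip the sign bit of every lane. */
static inline __m256 neg_ps(__m256 v)
{
        return _mm256_xor_ps(v, _mm256_set1_ps(-0.0f));
}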
+/// \returns A 256-bit vector of [4 x double] containing the horizontal +/// differences of both operands. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_hsub_pd(__m256d __a, __m256d __b) +{ + return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in two +/// 256-bit vectors of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHSUBPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// The horizontal differences between the values are returned in the +/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float] containing one of the source operands. +/// The horizontal differences between the values are returned in the +/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. +/// \returns A 256-bit vector of [8 x float] containing the horizontal +/// differences of both operands. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_hsub_ps(__m256 __a, __m256 __b) +{ + return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); +} + +/* Vector permutations */ +/// Copies the values in a 128-bit vector of [2 x double] as specified +/// by the 128-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __c +/// A 128-bit integer vector operand specifying how the values are to be +/// copied. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [65]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. +/// \returns A 128-bit vector of [2 x double] containing the copied values. +static __inline __m128d __DEFAULT_FN_ATTRS128 +_mm_permutevar_pd(__m128d __a, __m128i __c) +{ + return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); +} + +/// Copies the values in a 256-bit vector of [4 x double] as specified +/// by the 256-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __c +/// A 256-bit integer vector operand specifying how the values are to be +/// copied. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [65]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// Bit [129]: \n +/// 0: Bits [191:128] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// Bit [193]: \n +/// 0: Bits [191:128] of the source are copied to bits [255:192] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [255:192] of the +/// returned vector. +/// \returns A 256-bit vector of [4 x double] containing the copied values. 
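+/// +/// A minimal usage sketch (illustrative only, assuming an AVX-enabled clang +/// build): setting bit 1 of each 64-bit control field selects the upper +/// element of the corresponding 128-bit lane. +/// \code +/// __m256d v   = _mm256_set_pd(4.0, 3.0, 2.0, 1.0); /* elements {1,2,3,4}, low to high */ +/// __m256i sel = _mm256_set1_epi64x(1ULL << 1);     /* bit 1 set in every field */ +/// __m256d r   = _mm256_permutevar_pd(v, sel);      /* r = {2.0, 2.0, 4.0, 4.0} */ +/// \endcode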
+static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_permutevar_pd(__m256d __a, __m256i __c) +{ + return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); +} + +/// Copies the values stored in a 128-bit vector of [4 x float] as +/// specified by the 128-bit integer vector operand. +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __c +/// A 128-bit integer vector operand specifying how the values are to be +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [33:32]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [65:64]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [97:96]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. +/// \returns A 128-bit vector of [4 x float] containing the copied values. +static __inline __m128 __DEFAULT_FN_ATTRS128 +_mm_permutevar_ps(__m128 __a, __m128i __c) +{ + return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); +} + +/// Copies the values stored in a 256-bit vector of [8 x float] as +/// specified by the 256-bit integer vector operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __c +/// A 256-bit integer vector operand specifying how the values are to be +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [33:32]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. 
\n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [65:64]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [97:96]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// Bits [129:128]: \n +/// 00: Bits [159:128] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// Bits [161:160]: \n +/// 00: Bits [159:128] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// Bits [193:192]: \n +/// 00: Bits [159:128] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// Bits [225:224]: \n +/// 00: Bits [159:128] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [255:224] of the +/// returned vector. +/// \returns A 256-bit vector of [8 x float] containing the copied values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_permutevar_ps(__m256 __a, __m256i __c) +{ + return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); +} + +/// Copies the values in a 128-bit vector of [2 x double] as specified +/// by the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_permute_pd(__m128d A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to the VPERMILPD instruction. +/// +/// \param A +/// A 128-bit vector of [2 x double]. +/// \param C +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bit [0]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. 
\n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. +/// \returns A 128-bit vector of [2 x double] containing the copied values. +#define _mm_permute_pd(A, C) \ + ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))) + +/// Copies the values in a 256-bit vector of [4 x double] as specified by +/// the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_permute_pd(__m256d A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to the VPERMILPD instruction. +/// +/// \param A +/// A 256-bit vector of [4 x double]. +/// \param C +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bit [0]: \n +/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned +/// vector. \n +/// 1: Bits [127:64] of the source are copied to bits [63:0] of the +/// returned vector. \n +/// Bit [1]: \n +/// 0: Bits [63:0] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// 1: Bits [127:64] of the source are copied to bits [127:64] of the +/// returned vector. \n +/// Bit [2]: \n +/// 0: Bits [191:128] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [191:128] of the +/// returned vector. \n +/// Bit [3]: \n +/// 0: Bits [191:128] of the source are copied to bits [255:192] of the +/// returned vector. \n +/// 1: Bits [255:192] of the source are copied to bits [255:192] of the +/// returned vector. +/// \returns A 256-bit vector of [4 x double] containing the copied values. +#define _mm256_permute_pd(A, C) \ + ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))) + +/// Copies the values in a 128-bit vector of [4 x float] as specified by +/// the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_permute_ps(__m128 A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to the VPERMILPS instruction. +/// +/// \param A +/// A 128-bit vector of [4 x float]. +/// \param C +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [3:2]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [5:4]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. 
\n +/// Bits [7:6]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. +/// \returns A 128-bit vector of [4 x float] containing the copied values. +#define _mm_permute_ps(A, C) \ + ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))) + +/// Copies the values in a 256-bit vector of [8 x float] as specified by +/// the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_permute_ps(__m256 A, const int C); +/// \endcode +/// +/// This intrinsic corresponds to the VPERMILPS instruction. +/// +/// \param A +/// A 256-bit vector of [8 x float]. +/// \param C +/// An immediate integer operand specifying how the values are to be +/// copied. \n +/// Bits [1:0]: \n +/// 00: Bits [31:0] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [31:0] of the +/// returned vector. \n +/// Bits [3:2]: \n +/// 00: Bits [31:0] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [63:32] of the +/// returned vector. \n +/// Bits [5:4]: \n +/// 00: Bits [31:0] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [95:64] of the +/// returned vector. \n +/// Bits [7:6]: \n +/// 00: Bits [31:0] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 01: Bits [63:32] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 10: Bits [95:64] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// 11: Bits [127:96] of the source are copied to bits [127:96] of the +/// returned vector. \n +/// Bits [1:0]: \n +/// 00: Bits [159:128] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [159:128] of the +/// returned vector. \n +/// Bits [3:2]: \n +/// 00: Bits [159:128] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [191:160] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [191:160] of the +/// returned vector. 
\n +/// Bits [5:4]: \n +/// 00: Bits [159:128] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [223:192] of the +/// returned vector. \n +/// Bits [7:6]: \n +/// 00: Bits [159:128] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 01: Bits [191:160] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 10: Bits [223:192] of the source are copied to bits [255:224] of the +/// returned vector. \n +/// 11: Bits [255:224] of the source are copied to bits [255:224] of the +/// returned vector. +/// \returns A 256-bit vector of [8 x float] containing the copied values. +#define _mm256_permute_ps(A, C) \ + ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))) + +/// Permutes 128-bit data values stored in two 256-bit vectors of +/// [4 x double], as specified by the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double]. +/// \param V2 +/// A 256-bit vector of [4 x double]. +/// \param M +/// An immediate integer operand specifying how the values are to be +/// permuted. \n +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the +/// destination. +/// \returns A 256-bit vector of [4 x double] containing the copied values. +#define _mm256_permute2f128_pd(V1, V2, M) \ + ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M))) + +/// Permutes 128-bit data values stored in two 256-bit vectors of +/// [8 x float], as specified by the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float]. +/// \param V2 +/// A 256-bit vector of [8 x float]. +/// \param M +/// An immediate integer operand specifying how the values are to be +/// permuted. \n +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination.
\n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the +/// destination. +/// \returns A 256-bit vector of [8 x float] containing the copied values. +#define _mm256_permute2f128_ps(V1, V2, M) \ + ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M))) + +/// Permutes 128-bit data values stored in two 256-bit integer vectors, +/// as specified by the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPERM2F128 instruction. +/// +/// \param V1 +/// A 256-bit integer vector. +/// \param V2 +/// A 256-bit integer vector. +/// \param M +/// An immediate integer operand specifying how the values are to be copied. +/// Bits [1:0]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the +/// destination. \n +/// Bits [5:4]: \n +/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the +/// destination. \n +/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the +/// destination. \n +/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the +/// destination. +/// \returns A 256-bit integer vector containing the copied values. +#define _mm256_permute2f128_si256(V1, V2, M) \ + ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ + (__v8si)(__m256i)(V2), (int)(M))) + +/* Vector Blend */ +/// Merges 64-bit double-precision data values stored in either of the +/// two 256-bit vectors of [4 x double], as specified by the immediate +/// integer operand. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VBLENDPD instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double]. +/// \param V2 +/// A 256-bit vector of [4 x double]. +/// \param M +/// An immediate integer operand, with mask bits [3:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 64-bit +/// element in operand \a V1 is copied to the same position in the +/// destination. When a mask bit is 1, the corresponding 64-bit element in +/// operand \a V2 is copied to the same position in the destination. +/// \returns A 256-bit vector of [4 x double] containing the copied values. +#define _mm256_blend_pd(V1, V2, M) \ + ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ + (__v4df)(__m256d)(V2), (int)(M))) + +/// Merges 32-bit single-precision data values stored in either of the +/// two 256-bit vectors of [8 x float], as specified by the immediate +/// integer operand. 
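+/// +/// For example (an illustrative sketch, assuming an AVX-enabled clang build), +/// a mask of 0x0F takes the four low elements from \a V2 and the four high +/// elements from \a V1: +/// \code +/// __m256 v1 = _mm256_set1_ps(1.0f); +/// __m256 v2 = _mm256_set1_ps(2.0f); +/// __m256 r  = _mm256_blend_ps(v1, v2, 0x0F); /* r = {2,2,2,2, 1,1,1,1} */ +/// \endcode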
+/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VBLENDPS instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float]. +/// \param V2 +/// A 256-bit vector of [8 x float]. +/// \param M +/// An immediate integer operand, with mask bits [7:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 32-bit +/// element in operand \a V1 is copied to the same position in the +/// destination. When a mask bit is 1, the corresponding 32-bit element in +/// operand \a V2 is copied to the same position in the destination. +/// \returns A 256-bit vector of [8 x float] containing the copied values. +#define _mm256_blend_ps(V1, V2, M) \ + ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (int)(M))) + +/// Merges 64-bit double-precision data values stored in either of the +/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDVPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \param __c +/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying +/// how the values are to be copied. The position of the mask bit corresponds +/// to the most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 64-bit element in operand \a __a is copied to the same +/// position in the destination. When a mask bit is 1, the corresponding +/// 64-bit element in operand \a __b is copied to the same position in the +/// destination. +/// \returns A 256-bit vector of [4 x double] containing the copied values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) +{ + return (__m256d)__builtin_ia32_blendvpd256( + (__v4df)__a, (__v4df)__b, (__v4df)__c); +} + +/// Merges 32-bit single-precision data values stored in either of the +/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDVPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \param __c +/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, +/// and 31 specifying how the values are to be copied. The position of the +/// mask bit corresponds to the most significant bit of a copied value. When +/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is +/// copied to the same position in the destination. When a mask bit is 1, the +/// corresponding 32-bit element in operand \a __b is copied to the same +/// position in the destination. +/// \returns A 256-bit vector of [8 x float] containing the copied values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) +{ + return (__m256)__builtin_ia32_blendvps256( + (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); +} + +/* Vector Dot Product */ +/// Computes two dot products in parallel, using the lower and upper +/// halves of two [8 x float] vectors as input to the two computations, and +/// returning the two dot products in the lower and upper halves of the +/// [8 x float] result. 
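+/// +/// For example (an illustrative sketch, assuming an AVX-enabled clang build), +/// a mask of 0xFF uses all four elements of each half and broadcasts each +/// half's dot product to every element of that half; the mask layout is +/// described below. +/// \code +/// __m256 a = _mm256_set1_ps(1.0f); +/// __m256 b = _mm256_set1_ps(2.0f); +/// __m256 d = _mm256_dp_ps(a, b, 0xFF); /* every element of d becomes 8.0f */ +/// \endcode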
+/// +/// The immediate integer operand controls which input elements will +/// contribute to the dot product, and where the final results are returned. +/// In general, for each dot product, the four corresponding elements of the +/// input vectors are multiplied; the first two and second two products are +/// summed, then the two sums are added to form the final result. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VDPPS instruction. +/// +/// \param V1 +/// A vector of [8 x float] values, treated as two [4 x float] vectors. +/// \param V2 +/// A vector of [8 x float] values, treated as two [4 x float] vectors. +/// \param M +/// An immediate integer argument. Bits [7:4] determine which elements of +/// the input vectors are used, with bit [4] corresponding to the lowest +/// element and bit [7] corresponding to the highest element of each [4 x +/// float] subvector. If a bit is set, the corresponding elements from the +/// two input vectors are used as an input for dot product; otherwise that +/// input is treated as zero. Bits [3:0] determine which elements of the +/// result will receive a copy of the final dot product, with bit [0] +/// corresponding to the lowest element and bit [3] corresponding to the +/// highest element of each [4 x float] subvector. If a bit is set, the dot +/// product is returned in the corresponding element; otherwise that element +/// is set to zero. The bitmask is applied in the same way to each of the +/// two parallel dot product computations. +/// \returns A 256-bit vector of [8 x float] containing the two dot products. +#define _mm256_dp_ps(V1, V2, M) \ + ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ + (__v8sf)(__m256)(V2), (M))) + +/* Vector shuffle */ +/// Selects 8 float values from the 256-bit operands of [8 x float], as +/// specified by the immediate value operand. +/// +/// The four selected elements in each operand are copied to the destination +/// according to the bits specified in the immediate operand. The selected +/// elements from the first 256-bit operand are copied to bits [63:0] and +/// bits [191:128] of the destination, and the selected elements from the +/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of +/// the destination. For example, if bits [7:0] of the immediate operand +/// contain a value of 0xFF, the 256-bit destination vector would contain the +/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to the VSHUFPS instruction. +/// +/// \param a +/// A 256-bit vector of [8 x float]. The four selected elements in this +/// operand are copied to bits [63:0] and bits [191:128] in the destination, +/// according to the bits specified in the immediate operand. +/// \param b +/// A 256-bit vector of [8 x float]. The four selected elements in this +/// operand are copied to bits [127:64] and bits [255:192] in the +/// destination, according to the bits specified in the immediate operand. +/// \param mask +/// An immediate value containing an 8-bit value specifying which elements to +/// copy from \a a and \a b \n. +/// Bits [3:0] specify the values copied from operand \a a. \n +/// Bits [7:4] specify the values copied from operand \a b. 
\n +/// The destinations within the 256-bit destination are assigned values as +/// follows, according to the bit value assignments described below: \n +/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the +/// destination. \n +/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the +/// destination. \n +/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the +/// destination. \n +/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in +/// the destination. \n +/// Bit value assignments: \n +/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n +/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n +/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n +/// 11: Bits [127:96] and [255:224] are copied from the selected operand. +/// \returns A 256-bit vector of [8 x float] containing the shuffled values. +#define _mm256_shuffle_ps(a, b, mask) \ + ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (int)(mask))) + +/// Selects four double-precision values from the 256-bit operands of +/// [4 x double], as specified by the immediate value operand. +/// +/// The selected elements from the first 256-bit operand are copied to bits +/// [63:0] and bits [191:128] in the destination, and the selected elements +/// from the second 256-bit operand are copied to bits [127:64] and bits +/// [255:192] in the destination. For example, if bits [3:0] of the immediate +/// operand contain a value of 0xF, the 256-bit destination vector would +/// contain the following values: b[3], a[3], b[1], a[1]. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to the VSHUFPD instruction. +/// +/// \param a +/// A 256-bit vector of [4 x double]. +/// \param b +/// A 256-bit vector of [4 x double]. +/// \param mask +/// An immediate value containing 8-bit values specifying which elements to +/// copy from \a a and \a b: \n +/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the +/// destination. \n +/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the +/// destination. \n +/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the +/// destination. \n +/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the +/// destination. \n +/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the +/// destination. \n +/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the +/// destination. \n +/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the +/// destination. \n +/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the +/// destination. +/// \returns A 256-bit vector of [4 x double] containing the shuffled values. 
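+/// +/// A minimal usage sketch (illustrative only, assuming an AVX-enabled clang +/// build): with a mask of 0x5 (binary 0101) the result is +/// { a[1], b[0], a[3], b[2] }. +/// \code +/// __m256d a = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); /* a = {0,1,2,3} */ +/// __m256d b = _mm256_set_pd(7.0, 6.0, 5.0, 4.0); /* b = {4,5,6,7} */ +/// __m256d r = _mm256_shuffle_pd(a, b, 0x5);      /* r = {1,4,3,6} */ +/// \endcode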
+#define _mm256_shuffle_pd(a, b, mask) \ + ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (int)(mask))) + +/* Compare */ +#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ +#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ +#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ +#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ +#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ +#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ +#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ +#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */ +#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ +#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */ +#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ +#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ +#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ +#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ +#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ +#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ +#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ +#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ +#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ +#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ +#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ +#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ +#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */ +#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ +#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ +#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */ +#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ +#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ +#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ +#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ +#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ +#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ + +/// Compares each of the corresponding double-precision values of two +/// 128-bit vectors of [2 x double], using the operation specified by the +/// immediate integer operand. +/// +/// Returns a [2 x double] vector consisting of two doubles corresponding to +/// the two comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPPD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double]. +/// \param b +/// A 128-bit vector of [2 x double]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 128-bit vector of [2 x double] containing the comparison results. +#define _mm_cmp_pd(a, b, c) \ + ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (c))) + +/// Compares each of the corresponding values of two 128-bit vectors of +/// [4 x float], using the operation specified by the immediate integer +/// operand. +/// +/// Returns a [4 x float] vector consisting of four floats corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPPS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float]. +/// \param b +/// A 128-bit vector of [4 x float]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +#define _mm_cmp_ps(a, b, c) \ + ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (c))) + +/// Compares each of the corresponding double-precision values of two +/// 256-bit vectors of [4 x double], using the operation specified by the +/// immediate integer operand. +/// +/// Returns a [4 x double] vector consisting of four doubles corresponding to +/// the four comparison results: zero if the comparison is false, and all 1's +/// if the comparison is true. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPPD instruction. +/// +/// \param a +/// A 256-bit vector of [4 x double]. +/// \param b +/// A 256-bit vector of [4 x double]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 256-bit vector of [4 x double] containing the comparison results. +#define _mm256_cmp_pd(a, b, c) \ + ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ + (__v4df)(__m256d)(b), (c))) + +/// Compares each of the corresponding values of two 256-bit vectors of +/// [8 x float], using the operation specified by the immediate integer +/// operand. +/// +/// Returns a [8 x float] vector consisting of eight floats corresponding to +/// the eight comparison results: zero if the comparison is false, and all +/// 1's if the comparison is true. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPPS instruction. +/// +/// \param a +/// A 256-bit vector of [8 x float]. +/// \param b +/// A 256-bit vector of [8 x float]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 256-bit vector of [8 x float] containing the comparison results. +#define _mm256_cmp_ps(a, b, c) \ + ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ + (__v8sf)(__m256)(b), (c))) + +/// Compares each of the corresponding scalar double-precision values of +/// two 128-bit vectors of [2 x double], using the operation specified by the +/// immediate integer operand. +/// +/// If the result is true, all 64 bits of the destination vector are set; +/// otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPSD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double]. +/// \param b +/// A 128-bit vector of [2 x double]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 128-bit vector of [2 x double] containing the comparison results. +#define _mm_cmp_sd(a, b, c) \ + ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ + (__v2df)(__m128d)(b), (c))) + +/// Compares each of the corresponding scalar values of two 128-bit +/// vectors of [4 x float], using the operation specified by the immediate +/// integer operand. +/// +/// If the result is true, all 32 bits of the destination vector are set; +/// otherwise they are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); +/// \endcode +/// +/// This intrinsic corresponds to the VCMPSS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float]. +/// \param b +/// A 128-bit vector of [4 x float]. 
+/// \param c +/// An immediate integer operand, with bits [4:0] specifying which comparison +/// operation to use: \n +/// 0x00: Equal (ordered, non-signaling) \n +/// 0x01: Less-than (ordered, signaling) \n +/// 0x02: Less-than-or-equal (ordered, signaling) \n +/// 0x03: Unordered (non-signaling) \n +/// 0x04: Not-equal (unordered, non-signaling) \n +/// 0x05: Not-less-than (unordered, signaling) \n +/// 0x06: Not-less-than-or-equal (unordered, signaling) \n +/// 0x07: Ordered (non-signaling) \n +/// 0x08: Equal (unordered, non-signaling) \n +/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n +/// 0x0A: Not-greater-than (unordered, signaling) \n +/// 0x0B: False (ordered, non-signaling) \n +/// 0x0C: Not-equal (ordered, non-signaling) \n +/// 0x0D: Greater-than-or-equal (ordered, signaling) \n +/// 0x0E: Greater-than (ordered, signaling) \n +/// 0x0F: True (unordered, non-signaling) \n +/// 0x10: Equal (ordered, signaling) \n +/// 0x11: Less-than (ordered, non-signaling) \n +/// 0x12: Less-than-or-equal (ordered, non-signaling) \n +/// 0x13: Unordered (signaling) \n +/// 0x14: Not-equal (unordered, signaling) \n +/// 0x15: Not-less-than (unordered, non-signaling) \n +/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n +/// 0x17: Ordered (signaling) \n +/// 0x18: Equal (unordered, signaling) \n +/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n +/// 0x1A: Not-greater-than (unordered, non-signaling) \n +/// 0x1B: False (ordered, signaling) \n +/// 0x1C: Not-equal (ordered, signaling) \n +/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n +/// 0x1E: Greater-than (ordered, non-signaling) \n +/// 0x1F: True (unordered, signaling) +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +#define _mm_cmp_ss(a, b, c) \ + ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ + (__v4sf)(__m128)(b), (c))) + +/// Takes a [8 x i32] vector and returns the vector element value +/// indexed by the immediate constant operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \param __imm +/// An immediate integer operand with bits [2:0] determining which vector +/// element is extracted and returned. +/// \returns A 32-bit integer containing the extracted 32 bits of extended +/// packed data. +#define _mm256_extract_epi32(X, N) \ + ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))) + +/// Takes a [16 x i16] vector and returns the vector element value +/// indexed by the immediate constant operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A 256-bit integer vector of [16 x i16]. +/// \param __imm +/// An immediate integer operand with bits [3:0] determining which vector +/// element is extracted and returned. +/// \returns A 32-bit integer containing the extracted 16 bits of zero extended +/// packed data. +#define _mm256_extract_epi16(X, N) \ + ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ + (int)(N))) + +/// Takes a [32 x i8] vector and returns the vector element value +/// indexed by the immediate constant operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A 256-bit integer vector of [32 x i8]. 
+/// \param __imm +/// An immediate integer operand with bits [4:0] determining which vector +/// element is extracted and returned. +/// \returns A 32-bit integer containing the extracted 8 bits of zero extended +/// packed data. +#define _mm256_extract_epi8(X, N) \ + ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ + (int)(N))) + +#ifdef __x86_64__ +/// Takes a [4 x i64] vector and returns the vector element value +/// indexed by the immediate constant operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A 256-bit integer vector of [4 x i64]. +/// \param __imm +/// An immediate integer operand with bits [1:0] determining which vector +/// element is extracted and returned. +/// \returns A 64-bit integer containing the extracted 64 bits of extended +/// packed data. +#define _mm256_extract_epi64(X, N) \ + ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))) +#endif + +/// Takes a [8 x i32] vector and replaces the vector element value +/// indexed by the immediate constant operand by a new value. Returns the +/// modified vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A vector of [8 x i32] to be used by the insert operation. +/// \param __b +/// An integer value. The replacement value for the insert operation. +/// \param __imm +/// An immediate integer specifying the index of the vector element to be +/// replaced. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. +#define _mm256_insert_epi32(X, I, N) \ + ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ + (int)(I), (int)(N))) + + +/// Takes a [16 x i16] vector and replaces the vector element value +/// indexed by the immediate constant operand with a new value. Returns the +/// modified vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A vector of [16 x i16] to be used by the insert operation. +/// \param __b +/// An i16 integer value. The replacement value for the insert operation. +/// \param __imm +/// An immediate integer specifying the index of the vector element to be +/// replaced. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. +#define _mm256_insert_epi16(X, I, N) \ + ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ + (int)(I), (int)(N))) + +/// Takes a [32 x i8] vector and replaces the vector element value +/// indexed by the immediate constant operand with a new value. Returns the +/// modified vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A vector of [32 x i8] to be used by the insert operation. +/// \param __b +/// An i8 integer value. The replacement value for the insert operation. +/// \param __imm +/// An immediate integer specifying the index of the vector element to be +/// replaced. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. +#define _mm256_insert_epi8(X, I, N) \ + ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ + (int)(I), (int)(N))) + +#ifdef __x86_64__ +/// Takes a [4 x i64] vector and replaces the vector element value +/// indexed by the immediate constant operand with a new value. 
Returns the +/// modified vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128+COMPOSITE +/// instruction. +/// +/// \param __a +/// A vector of [4 x i64] to be used by the insert operation. +/// \param __b +/// A 64-bit integer value. The replacement value for the insert operation. +/// \param __imm +/// An immediate integer specifying the index of the vector element to be +/// replaced. +/// \returns A copy of vector \a __a, after replacing its element indexed by +/// \a __imm with \a __b. +#define _mm256_insert_epi64(X, I, N) \ + ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ + (long long)(I), (int)(N))) +#endif + +/* Conversion */ +/// Converts a vector of [4 x i32] into a vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTDQ2PD instruction. +/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. +/// \returns A 256-bit vector of [4 x double] containing the converted values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_cvtepi32_pd(__m128i __a) +{ + return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); +} + +/// Converts a vector of [8 x i32] into a vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTDQ2PS instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit vector of [8 x float] containing the converted values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_cvtepi32_ps(__m256i __a) +{ + return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); +} + +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPD2PS instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit vector of [4 x float] containing the converted values. +static __inline __m128 __DEFAULT_FN_ATTRS +_mm256_cvtpd_ps(__m256d __a) +{ + return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); +} + +/// Converts a vector of [8 x float] into a vector of [8 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPS2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit integer vector containing the converted values. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_cvtps_epi32(__m256 __a) +{ + return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); +} + +/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 +/// x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPS2PD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit vector of [4 x double] containing the converted values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_cvtps_pd(__m128 __a) +{ + return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); +} + +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// x i32], truncating the result by rounding towards zero when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTPD2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. +static __inline __m128i __DEFAULT_FN_ATTRS +_mm256_cvttpd_epi32(__m256d __a) +{ + return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); +} + +/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 +/// x i32]. 
When a conversion is inexact, the value returned is rounded +/// according to the rounding control bits in the MXCSR register. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPD2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 128-bit integer vector containing the converted values. +static __inline __m128i __DEFAULT_FN_ATTRS +_mm256_cvtpd_epi32(__m256d __a) +{ + return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); +} + +/// Converts a vector of [8 x float] into a vector of [8 x i32], +/// truncating the result by rounding towards zero when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTPS2DQ instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 256-bit integer vector containing the converted values. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_cvttps_epi32(__m256 __a) +{ + return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); +} + +/// Returns the first element of the input vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \returns A 64 bit double containing the first element of the input vector. +static __inline double __DEFAULT_FN_ATTRS +_mm256_cvtsd_f64(__m256d __a) +{ + return __a[0]; +} + +/// Returns the first element of the input vector of [8 x i32]. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x i32]. +/// \returns A 32 bit integer containing the first element of the input vector. +static __inline int __DEFAULT_FN_ATTRS +_mm256_cvtsi256_si32(__m256i __a) +{ + __v8si __b = (__v8si)__a; + return __b[0]; +} + +/// Returns the first element of the input vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \returns A 32 bit float containing the first element of the input vector. +static __inline float __DEFAULT_FN_ATTRS +_mm256_cvtss_f32(__m256 __a) +{ + return __a[0]; +} + +/* Vector replicate */ +/// Moves and duplicates odd-indexed values from a 256-bit vector of +/// [8 x float] to float values in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSHDUP instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of +/// the return value. \n +/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of +/// the return value. \n +/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the +/// return value. \n +/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the +/// return value. +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated +/// values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_movehdup_ps(__m256 __a) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); +} + +/// Moves and duplicates even-indexed values from a 256-bit vector of +/// [8 x float] to float values in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSLDUP instruction. 
+/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of +/// the return value. \n +/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of +/// the return value. \n +/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the +/// return value. \n +/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the +/// return value. +/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated +/// values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_moveldup_ps(__m256 __a) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); +} + +/// Moves and duplicates double-precision floating point values from a +/// 256-bit vector of [4 x double] to double-precision values in a 256-bit +/// vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. \n +/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the +/// return value. \n +/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of +/// the return value. +/// \returns A 256-bit vector of [4 x double] containing the moved and +/// duplicated values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_movedup_pd(__m256d __a) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); +} + +/* Unpack and Interleave */ +/// Unpacks the odd-indexed vector elements from two 256-bit vectors of +/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKHPD instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [127:64] are written to bits [63:0] of the return value. \n +/// Bits [255:192] are written to bits [191:128] of the return value. \n +/// \param __b +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the return value. \n +/// Bits [255:192] are written to bits [255:192] of the return value. \n +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_unpackhi_pd(__m256d __a, __m256d __b) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); +} + +/// Unpacks the even-indexed vector elements from two 256-bit vectors of +/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the return value. \n +/// Bits [191:128] are written to bits [191:128] of the return value. +/// \param __b +/// A 256-bit floating-point vector of [4 x double]. \n +/// Bits [63:0] are written to bits [127:64] of the return value. \n +/// Bits [191:128] are written to bits [255:192] of the return value. \n +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 
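+///
+/// A minimal usage sketch (illustrative only; assumes <immintrin.h> and an
+/// AVX-enabled build, with arbitrary values and hypothetical names):
+/// \code
+/// static __m256d interleave_low_example(void)
+/// {
+///   __m256d lo = _mm256_set_pd(3.0, 2.0, 1.0, 0.0); /* elements {0,1,2,3} */
+///   __m256d hi = _mm256_set_pd(7.0, 6.0, 5.0, 4.0); /* elements {4,5,6,7} */
+///   /* Returns {0.0, 4.0, 2.0, 6.0}: the even-indexed elements of the two
+///      sources, interleaved within each 128-bit lane. */
+///   return _mm256_unpacklo_pd(lo, hi);
+/// }
+/// \endcode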
+static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_unpacklo_pd(__m256d __a, __m256d __b) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); +} + +/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the +/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKHPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [95:64] are written to bits [31:0] of the return value. \n +/// Bits [127:96] are written to bits [95:64] of the return value. \n +/// Bits [223:192] are written to bits [159:128] of the return value. \n +/// Bits [255:224] are written to bits [223:192] of the return value. +/// \param __b +/// A 256-bit vector of [8 x float]. \n +/// Bits [95:64] are written to bits [63:32] of the return value. \n +/// Bits [127:96] are written to bits [127:96] of the return value. \n +/// Bits [223:192] are written to bits [191:160] of the return value. \n +/// Bits [255:224] are written to bits [255:224] of the return value. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_unpackhi_ps(__m256 __a, __m256 __b) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); +} + +/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the +/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit +/// vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. \n +/// Bits [31:0] are written to bits [31:0] of the return value. \n +/// Bits [63:32] are written to bits [95:64] of the return value. \n +/// Bits [159:128] are written to bits [159:128] of the return value. \n +/// Bits [191:160] are written to bits [223:192] of the return value. +/// \param __b +/// A 256-bit vector of [8 x float]. \n +/// Bits [31:0] are written to bits [63:32] of the return value. \n +/// Bits [63:32] are written to bits [127:96] of the return value. \n +/// Bits [159:128] are written to bits [191:160] of the return value. \n +/// Bits [191:160] are written to bits [255:224] of the return value. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_unpacklo_ps(__m256 __a, __m256 __b) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); +} + +/* Bit Test */ +/// Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. 
+/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns the ZF flag in the EFLAGS register. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testz_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); +} + +/// Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns the CF flag in the EFLAGS register. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testc_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); +} + +/// Given two 128-bit floating-point vectors of [2 x double], perform an +/// element-by-element comparison of the double-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testnzc_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); +} + +/// Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. 
+/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns the ZF flag. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testz_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); +} + +/// Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns the CF flag. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testc_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); +} + +/// Given two 128-bit floating-point vectors of [4 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. +static __inline int __DEFAULT_FN_ATTRS128 +_mm_testnzc_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); +} + +/// Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. 
+/// \returns the ZF flag. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testz_pd(__m256d __a, __m256d __b) +{ + return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); +} + +/// Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \returns the CF flag. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testc_pd(__m256d __a, __m256d __b) +{ + return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); +} + +/// Given two 256-bit floating-point vectors of [4 x double], perform an +/// element-by-element comparison of the double-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of double-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of double-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double]. +/// \param __b +/// A 256-bit vector of [4 x double]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testnzc_pd(__m256d __a, __m256d __b) +{ + return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); +} + +/// Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns the ZF flag. 
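+///
+/// A minimal usage sketch (illustrative only; assumes <immintrin.h> and an
+/// AVX-enabled build, with a hypothetical helper name):
+/// \code
+/// static int no_sign_bits_set(__m256 v)
+/// {
+///   /* Testing a vector against itself: the result is 1 only when no
+///      element of v has its sign bit set. */
+///   return _mm256_testz_ps(v, v);
+/// }
+/// \endcode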
+static __inline int __DEFAULT_FN_ATTRS +_mm256_testz_ps(__m256 __a, __m256 __b) +{ + return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); +} + +/// Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision element in the +/// first source vector and the corresponding element in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns the CF flag. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testc_ps(__m256 __a, __m256 __b) +{ + return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); +} + +/// Given two 256-bit floating-point vectors of [8 x float], perform an +/// element-by-element comparison of the single-precision elements in the +/// first source vector and the corresponding elements in the second source +/// vector. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of single-precision elements where the +/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the +/// ZF flag is set to 1. \n +/// If there is at least one pair of single-precision elements where the +/// sign-bit of the first element is 0 and the sign-bit of the second element +/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VTESTPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float]. +/// \param __b +/// A 256-bit vector of [8 x float]. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testnzc_ps(__m256 __a, __m256 __b) +{ + return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); +} + +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the ZF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns the ZF flag. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testz_si256(__m256i __a, __m256i __b) +{ + return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); +} + +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors. 
+/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns the value of the CF flag. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns the CF flag. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testc_si256(__m256i __a, __m256i __b) +{ + return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); +} + +/// Given two 256-bit integer vectors, perform a bit-by-bit comparison +/// of the two source vectors. +/// +/// The EFLAGS register is updated as follows: \n +/// If there is at least one pair of bits where both bits are 1, the ZF flag +/// is set to 0. Otherwise the ZF flag is set to 1. \n +/// If there is at least one pair of bits where the bit from the first source +/// vector is 0 and the bit from the second source vector is 1, the CF flag +/// is set to 0. Otherwise the CF flag is set to 1. \n +/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, +/// otherwise it returns 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \param __b +/// A 256-bit integer vector. +/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. +static __inline int __DEFAULT_FN_ATTRS +_mm256_testnzc_si256(__m256i __a, __m256i __b) +{ + return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); +} + +/* Vector extract sign mask */ +/// Extracts the sign bits of double-precision floating point elements +/// in a 256-bit vector of [4 x double] and writes them to the lower order +/// bits of the return value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVMSKPD instruction. +/// +/// \param __a +/// A 256-bit vector of [4 x double] containing the double-precision +/// floating point values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [3:0]. +static __inline int __DEFAULT_FN_ATTRS +_mm256_movemask_pd(__m256d __a) +{ + return __builtin_ia32_movmskpd256((__v4df)__a); +} + +/// Extracts the sign bits of single-precision floating point elements +/// in a 256-bit vector of [8 x float] and writes them to the lower order +/// bits of the return value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVMSKPS instruction. +/// +/// \param __a +/// A 256-bit vector of [8 x float] containing the single-precision floating +/// point values with sign bits to be extracted. +/// \returns The sign bits from the operand, written to bits [7:0]. +static __inline int __DEFAULT_FN_ATTRS +_mm256_movemask_ps(__m256 __a) +{ + return __builtin_ia32_movmskps256((__v8sf)__a); +} + +/* Vector __zero */ +/// Zeroes the contents of all XMM or YMM registers. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VZEROALL instruction. +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) +_mm256_zeroall(void) +{ + __builtin_ia32_vzeroall(); +} + +/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 
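+///
+/// As an illustrative usage note (a sketch, not part of the intrinsic's
+/// definition): this intrinsic is commonly issued after a block of 256-bit
+/// AVX code, before control returns to code that may execute legacy SSE
+/// instructions, to avoid AVX/SSE transition penalties. Assuming
+/// <immintrin.h> and an AVX-enabled build, with hypothetical names:
+/// \code
+/// static void scale_floats(float *dst, const float *src, float f, int n)
+/// {
+///   __m256 vf = _mm256_set1_ps(f);
+///   int i;
+///   for (i = 0; i + 8 <= n; i += 8)
+///     _mm256_storeu_ps(dst + i, _mm256_mul_ps(_mm256_loadu_ps(src + i), vf));
+///   _mm256_zeroupper(); /* clear the upper YMM halves before returning */
+///   /* (remaining n % 8 elements omitted for brevity) */
+/// }
+/// \endcode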
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VZEROUPPER instruction. +static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) +_mm256_zeroupper(void) +{ + __builtin_ia32_vzeroupper(); +} + +/* Vector load with broadcast */ +/// Loads a scalar single-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [4 x float] vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTSS instruction. +/// +/// \param __a +/// The single-precision floating point value to be broadcast. +/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set +/// equal to the broadcast value. +static __inline __m128 __DEFAULT_FN_ATTRS128 +_mm_broadcast_ss(float const *__a) +{ + float __f = *__a; + return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f }; +} + +/// Loads a scalar double-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [4 x double] vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTSD instruction. +/// +/// \param __a +/// The double-precision floating point value to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set +/// equal to the broadcast value. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_broadcast_sd(double const *__a) +{ + double __d = *__a; + return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; +} + +/// Loads a scalar single-precision floating point value from the +/// specified address pointed to by \a __a and broadcasts it to the elements +/// of a [8 x float] vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTSS instruction. +/// +/// \param __a +/// The single-precision floating point value to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set +/// equal to the broadcast value. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_broadcast_ss(float const *__a) +{ + float __f = *__a; + return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; +} + +/// Loads the data from a 128-bit vector of [2 x double] from the +/// specified address pointed to by \a __a and broadcasts it to 128-bit +/// elements in a 256-bit vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTF128 instruction. +/// +/// \param __a +/// The 128-bit vector of [2 x double] to be broadcast. +/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set +/// equal to the broadcast value. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_broadcast_pd(__m128d const *__a) +{ + __m128d __b = _mm_loadu_pd((const double *)__a); + return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, + 0, 1, 0, 1); +} + +/// Loads the data from a 128-bit vector of [4 x float] from the +/// specified address pointed to by \a __a and broadcasts it to 128-bit +/// elements in a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTF128 instruction. +/// +/// \param __a +/// The 128-bit vector of [4 x float] to be broadcast. +/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set +/// equal to the broadcast value. 
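+///
+/// A minimal usage sketch (illustrative only; assumes <immintrin.h> and an
+/// AVX-enabled build, with arbitrary values and hypothetical names):
+/// \code
+/// static __m256 duplicate_lane_example(void)
+/// {
+///   __m128 lane = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); /* {1,2,3,4} */
+///   /* Both 128-bit halves of the result hold {1,2,3,4}. */
+///   return _mm256_broadcast_ps(&lane);
+/// }
+/// \endcode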
+static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_broadcast_ps(__m128 const *__a) +{ + __m128 __b = _mm_loadu_ps((const float *)__a); + return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, + 0, 1, 2, 3, 0, 1, 2, 3); +} + +/* SIMD load ops */ +/// Loads 4 double-precision floating point values from a 32-byte aligned +/// memory location pointed to by \a __p into a vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPD instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing +/// double-precision floating point values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_load_pd(double const *__p) +{ + return *(const __m256d *)__p; +} + +/// Loads 8 single-precision floating point values from a 32-byte aligned +/// memory location pointed to by \a __p into a vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location containing float values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_load_ps(float const *__p) +{ + return *(const __m256 *)__p; +} + +/// Loads 4 double-precision floating point values from an unaligned +/// memory location pointed to by \a __p into a vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPD instruction. +/// +/// \param __p +/// A pointer to a memory location containing double-precision floating +/// point values. +/// \returns A 256-bit vector of [4 x double] containing the moved values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_loadu_pd(double const *__p) +{ + struct __loadu_pd { + __m256d_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pd*)__p)->__v; +} + +/// Loads 8 single-precision floating point values from an unaligned +/// memory location pointed to by \a __p into a vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location containing single-precision floating +/// point values. +/// \returns A 256-bit vector of [8 x float] containing the moved values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_loadu_ps(float const *__p) +{ + struct __loadu_ps { + __m256_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ps*)__p)->__v; +} + +/// Loads 256 bits of integer data from a 32-byte aligned memory +/// location pointed to by \a __p into elements of a 256-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQA instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a 256-bit integer vector containing integer +/// values. +/// \returns A 256-bit integer vector containing the moved values. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_load_si256(__m256i const *__p) +{ + return *__p; +} + +/// Loads 256 bits of integer data from an unaligned memory location +/// pointed to by \a __p into a 256-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer values. +/// \returns A 256-bit integer vector containing the moved values. 
+static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_loadu_si256(__m256i_u const *__p) +{ + struct __loadu_si256 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_si256*)__p)->__v; +} + +/// Loads 256 bits of integer data from an unaligned memory location +/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may +/// perform better than \c _mm256_loadu_si256 when the data crosses a cache +/// line boundary. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VLDDQU instruction. +/// +/// \param __p +/// A pointer to a 256-bit integer vector containing integer values. +/// \returns A 256-bit integer vector containing the moved values. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_lddqu_si256(__m256i const *__p) +{ + return (__m256i)__builtin_ia32_lddqu256((char const *)__p); +} + +/* SIMD store ops */ +/// Stores double-precision floating point values from a 256-bit vector +/// of [4 x double] to a 32-byte aligned memory location pointed to by +/// \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPD instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// double-precision floating point values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_store_pd(double *__p, __m256d __a) +{ + *(__m256d *)__p = __a; +} + +/// Stores single-precision floating point values from a 256-bit vector +/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS instruction. +/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_store_ps(float *__p, __m256 __a) +{ + *(__m256 *)__p = __a; +} + +/// Stores double-precision floating point values from a 256-bit vector +/// of [4 x double] to an unaligned memory location pointed to by \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the double-precision +/// floating point values. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu_pd(double *__p, __m256d __a) +{ + struct __storeu_pd { + __m256d_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pd*)__p)->__v = __a; +} + +/// Stores single-precision floating point values from a 256-bit vector +/// of [8 x float] to an unaligned memory location pointed to by \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu_ps(float *__p, __m256 __a) +{ + struct __storeu_ps { + __m256_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ps*)__p)->__v = __a; +} + +/// Stores integer values from a 256-bit integer vector to a 32-byte +/// aligned memory location pointed to by \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQA instruction.
+/// +/// \param __p +/// A 32-byte aligned pointer to a memory location that will receive the +/// integer values. +/// \param __a +/// A 256-bit integer vector containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_store_si256(__m256i *__p, __m256i __a) +{ + *__p = __a; +} + +/// Stores integer values from a 256-bit integer vector to an unaligned +/// memory location pointed to by \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQU instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer values. +/// \param __a +/// A 256-bit integer vector containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu_si256(__m256i_u *__p, __m256i __a) +{ + struct __storeu_si256 { + __m256i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si256*)__p)->__v = __a; +} + +/* Conditional load ops */ +/// Conditionally loads double-precision floating point elements from a +/// memory location pointed to by \a __p into a 128-bit vector of +/// [2 x double], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the double-precision +/// floating point values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each data element represents the mask bits. If a mask bit is zero, the +/// corresponding value in the memory location is not loaded and the +/// corresponding field in the return value is set to zero. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. +static __inline __m128d __DEFAULT_FN_ATTRS128 +_mm_maskload_pd(double const *__p, __m128i __m) +{ + return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); +} + +/// Conditionally loads double-precision floating point elements from a +/// memory location pointed to by \a __p into a 256-bit vector of +/// [4 x double], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the double-precision +/// floating point values. +/// \param __m +/// A 256-bit integer vector of [4 x quadword] containing the mask. The most +/// significant bit of each quadword element represents the mask bits. If a +/// mask bit is zero, the corresponding value in the memory location is not +/// loaded and the corresponding field in the return value is set to zero. +/// \returns A 256-bit vector of [4 x double] containing the loaded values. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_maskload_pd(double const *__p, __m256i __m) +{ + return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, + (__v4di)__m); +} + +/// Conditionally loads single-precision floating point elements from a +/// memory location pointed to by \a __p into a 128-bit vector of +/// [4 x float], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the single-precision +/// floating point values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each data element represents the mask bits. 
If a mask bit is zero, the +/// corresponding value in the memory location is not loaded and the +/// corresponding field in the return value is set to zero. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. +static __inline __m128 __DEFAULT_FN_ATTRS128 +_mm_maskload_ps(float const *__p, __m128i __m) +{ + return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); +} + +/// Conditionally loads single-precision floating point elements from a +/// memory location pointed to by \a __p into a 256-bit vector of +/// [8 x float], depending on the mask bits associated with each data +/// element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that contains the single-precision +/// floating point values. +/// \param __m +/// A 256-bit integer vector of [8 x dword] containing the mask. The most +/// significant bit of each dword element represents the mask bits. If a mask +/// bit is zero, the corresponding value in the memory location is not loaded +/// and the corresponding field in the return value is set to zero. +/// \returns A 256-bit vector of [8 x float] containing the loaded values. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_maskload_ps(float const *__p, __m256i __m) +{ + return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); +} + +/* Conditional store ops */ +/// Moves single-precision floating point values from a 256-bit vector +/// of [8 x float] to a memory location pointed to by \a __p, according to +/// the specified mask. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 256-bit integer vector of [8 x dword] containing the mask. The most +/// significant bit of each dword element in the mask vector represents the +/// mask bits. If a mask bit is zero, the corresponding value from vector +/// \a __a is not stored and the corresponding field in the memory location +/// pointed to by \a __p is not changed. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be stored. +static __inline void __DEFAULT_FN_ATTRS +_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) +{ + __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); +} + +/// Moves double-precision values from a 128-bit vector of [2 x double] +/// to a memory location pointed to by \a __p, according to the specified +/// mask. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each field in the mask vector represents the mask bits. If a mask bit is +/// zero, the corresponding value from vector \a __a is not stored and the +/// corresponding field in the memory location pointed to by \a __p is not +/// changed. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be stored. +static __inline void __DEFAULT_FN_ATTRS128 +_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) +{ + __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); +} + +/// Moves double-precision values from a 256-bit vector of [4 x double] +/// to a memory location pointed to by \a __p, according to the specified +/// mask. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPD instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 256-bit integer vector of [4 x quadword] containing the mask. The most +/// significant bit of each quadword element in the mask vector represents +/// the mask bits. If a mask bit is zero, the corresponding value from vector +/// __a is not stored and the corresponding field in the memory location +/// pointed to by \a __p is not changed. +/// \param __a +/// A 256-bit vector of [4 x double] containing the values to be stored. +static __inline void __DEFAULT_FN_ATTRS +_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) +{ + __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); +} + +/// Moves single-precision floating point values from a 128-bit vector +/// of [4 x float] to a memory location pointed to by \a __p, according to +/// the specified mask. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __m +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each field in the mask vector represents the mask bits. If a mask bit is +/// zero, the corresponding value from vector __a is not stored and the +/// corresponding field in the memory location pointed to by \a __p is not +/// changed. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline void __DEFAULT_FN_ATTRS128 +_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) +{ + __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); +} + +/* Cacheability support ops */ +/// Moves integer data from a 256-bit integer vector to a 32-byte +/// aligned memory location. To minimize caching, the data is flagged as +/// non-temporal (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTDQ instruction. +/// +/// \param __a +/// A pointer to a 32-byte aligned memory location that will receive the +/// integer values. +/// \param __b +/// A 256-bit integer vector containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_stream_si256(__m256i *__a, __m256i __b) +{ + typedef __v4di __v4di_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); +} + +/// Moves double-precision values from a 256-bit vector of [4 x double] +/// to a 32-byte aligned memory location. To minimize caching, the data is +/// flagged as non-temporal (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTPD instruction. +/// +/// \param __a +/// A pointer to a 32-byte aligned memory location that will receive the +/// double-precision floating-point values. +/// \param __b +/// A 256-bit vector of [4 x double] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_stream_pd(double *__a, __m256d __b) +{ + typedef __v4df __v4df_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); +} + +/// Moves single-precision floating point values from a 256-bit vector +/// of [8 x float] to a 32-byte aligned memory location. To minimize +/// caching, the data is flagged as non-temporal (unlikely to be used again +/// soon). 
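+///
+/// A minimal usage sketch (illustrative only; assumes <immintrin.h> and an
+/// AVX-enabled build, with hypothetical names; the destination must be
+/// 32-byte aligned, and a store fence is the conventional way to order the
+/// streaming stores before the buffer is consumed elsewhere):
+/// \code
+/// static void fill_stream(float *dst32 /* 32-byte aligned */, float v, int n)
+/// {
+///   __m256 vv = _mm256_set1_ps(v);
+///   int i;
+///   for (i = 0; i + 8 <= n; i += 8)
+///     _mm256_stream_ps(dst32 + i, vv);
+///   _mm_sfence(); /* order the non-temporal stores before later stores */
+///   /* (remaining n % 8 elements omitted for brevity) */
+/// }
+/// \endcode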
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTPS instruction. +/// +/// \param __p +/// A pointer to a 32-byte aligned memory location that will receive the +/// single-precision floating point values. +/// \param __a +/// A 256-bit vector of [8 x float] containing the values to be moved. +static __inline void __DEFAULT_FN_ATTRS +_mm256_stream_ps(float *__p, __m256 __a) +{ + typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); + __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); +} + +/* Create vectors */ +/// Create a 256-bit vector of [4 x double] with undefined values. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit vector of [4 x double] containing undefined values. +static __inline__ __m256d __DEFAULT_FN_ATTRS +_mm256_undefined_pd(void) +{ + return (__m256d)__builtin_ia32_undef256(); +} + +/// Create a 256-bit vector of [8 x float] with undefined values. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit vector of [8 x float] containing undefined values. +static __inline__ __m256 __DEFAULT_FN_ATTRS +_mm256_undefined_ps(void) +{ + return (__m256)__builtin_ia32_undef256(); +} + +/// Create a 256-bit integer vector with undefined values. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 256-bit integer vector containing undefined values. +static __inline__ __m256i __DEFAULT_FN_ATTRS +_mm256_undefined_si256(void) +{ + return (__m256i)__builtin_ia32_undef256(); +} + +/// Constructs a 256-bit floating-point vector of [4 x double] +/// initialized with the specified double-precision floating-point values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD+VINSERTF128 +/// instruction. +/// +/// \param __a +/// A double-precision floating-point value used to initialize bits [255:192] +/// of the result. +/// \param __b +/// A double-precision floating-point value used to initialize bits [191:128] +/// of the result. +/// \param __c +/// A double-precision floating-point value used to initialize bits [127:64] +/// of the result. +/// \param __d +/// A double-precision floating-point value used to initialize bits [63:0] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_set_pd(double __a, double __b, double __c, double __d) +{ + return __extension__ (__m256d){ __d, __c, __b, __a }; +} + +/// Constructs a 256-bit floating-point vector of [8 x float] initialized +/// with the specified single-precision floating-point values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A single-precision floating-point value used to initialize bits [255:224] +/// of the result. +/// \param __b +/// A single-precision floating-point value used to initialize bits [223:192] +/// of the result. +/// \param __c +/// A single-precision floating-point value used to initialize bits [191:160] +/// of the result. +/// \param __d +/// A single-precision floating-point value used to initialize bits [159:128] +/// of the result. +/// \param __e +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \param __f +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. 
+/// \param __g +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. +/// \param __h +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_set_ps(float __a, float __b, float __c, float __d, + float __e, float __f, float __g, float __h) +{ + return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; +} + +/// Constructs a 256-bit integer vector initialized with the specified +/// 32-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [255:224] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [223:192] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [191:160] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [159:128] of the result. +/// \param __i4 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \param __i5 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i6 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i7 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, + int __i4, int __i5, int __i6, int __i7) +{ + return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; +} + +/// Constructs a 256-bit integer vector initialized with the specified +/// 16-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w15 +/// A 16-bit integral value used to initialize bits [255:240] of the result. +/// \param __w14 +/// A 16-bit integral value used to initialize bits [239:224] of the result. +/// \param __w13 +/// A 16-bit integral value used to initialize bits [223:208] of the result. +/// \param __w12 +/// A 16-bit integral value used to initialize bits [207:192] of the result. +/// \param __w11 +/// A 16-bit integral value used to initialize bits [191:176] of the result. +/// \param __w10 +/// A 16-bit integral value used to initialize bits [175:160] of the result. +/// \param __w09 +/// A 16-bit integral value used to initialize bits [159:144] of the result. +/// \param __w08 +/// A 16-bit integral value used to initialize bits [143:128] of the result. +/// \param __w07 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \param __w06 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w05 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w04 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w03 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w02 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w01 +/// A 16-bit integral value used to initialize bits [31:16] of the result. 
+/// \param __w00 +/// A 16-bit integral value used to initialize bits [15:0] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, + short __w11, short __w10, short __w09, short __w08, + short __w07, short __w06, short __w05, short __w04, + short __w03, short __w02, short __w01, short __w00) +{ + return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, + __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; +} + +/// Constructs a 256-bit integer vector initialized with the specified +/// 8-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b31 +/// An 8-bit integral value used to initialize bits [255:248] of the result. +/// \param __b30 +/// An 8-bit integral value used to initialize bits [247:240] of the result. +/// \param __b29 +/// An 8-bit integral value used to initialize bits [239:232] of the result. +/// \param __b28 +/// An 8-bit integral value used to initialize bits [231:224] of the result. +/// \param __b27 +/// An 8-bit integral value used to initialize bits [223:216] of the result. +/// \param __b26 +/// An 8-bit integral value used to initialize bits [215:208] of the result. +/// \param __b25 +/// An 8-bit integral value used to initialize bits [207:200] of the result. +/// \param __b24 +/// An 8-bit integral value used to initialize bits [199:192] of the result. +/// \param __b23 +/// An 8-bit integral value used to initialize bits [191:184] of the result. +/// \param __b22 +/// An 8-bit integral value used to initialize bits [183:176] of the result. +/// \param __b21 +/// An 8-bit integral value used to initialize bits [175:168] of the result. +/// \param __b20 +/// An 8-bit integral value used to initialize bits [167:160] of the result. +/// \param __b19 +/// An 8-bit integral value used to initialize bits [159:152] of the result. +/// \param __b18 +/// An 8-bit integral value used to initialize bits [151:144] of the result. +/// \param __b17 +/// An 8-bit integral value used to initialize bits [143:136] of the result. +/// \param __b16 +/// An 8-bit integral value used to initialize bits [135:128] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b09 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b08 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b07 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b06 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b05 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b04 +/// An 8-bit integral value used to initialize bits [39:32] of the result. 
+/// \param __b03 +/// An 8-bit integral value used to initialize bits [31:24] of the result. +/// \param __b02 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b01 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b00 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, + char __b27, char __b26, char __b25, char __b24, + char __b23, char __b22, char __b21, char __b20, + char __b19, char __b18, char __b17, char __b16, + char __b15, char __b14, char __b13, char __b12, + char __b11, char __b10, char __b09, char __b08, + char __b07, char __b06, char __b05, char __b04, + char __b03, char __b02, char __b01, char __b00) +{ + return __extension__ (__m256i)(__v32qi){ + __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, + __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, + __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, + __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 + }; +} + +/// Constructs a 256-bit integer vector initialized with the specified +/// 64-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLQDQ+VINSERTF128 +/// instruction. +/// +/// \param __a +/// A 64-bit integral value used to initialize bits [255:192] of the result. +/// \param __b +/// A 64-bit integral value used to initialize bits [191:128] of the result. +/// \param __c +/// A 64-bit integral value used to initialize bits [127:64] of the result. +/// \param __d +/// A 64-bit integral value used to initialize bits [63:0] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) +{ + return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; +} + +/* Create vectors with elements in reverse order */ +/// Constructs a 256-bit floating-point vector of [4 x double], +/// initialized in reverse order with the specified double-precision +/// floating-point values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD+VINSERTF128 +/// instruction. +/// +/// \param __a +/// A double-precision floating-point value used to initialize bits [63:0] +/// of the result. +/// \param __b +/// A double-precision floating-point value used to initialize bits [127:64] +/// of the result. +/// \param __c +/// A double-precision floating-point value used to initialize bits [191:128] +/// of the result. +/// \param __d +/// A double-precision floating-point value used to initialize bits [255:192] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_setr_pd(double __a, double __b, double __c, double __d) +{ + return _mm256_set_pd(__d, __c, __b, __a); +} + +/// Constructs a 256-bit floating-point vector of [8 x float], +/// initialized in reverse order with the specified single-precision +/// float-point values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __a +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \param __b +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. 
+/// \param __c +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. +/// \param __d +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \param __e +/// A single-precision floating-point value used to initialize bits [159:128] +/// of the result. +/// \param __f +/// A single-precision floating-point value used to initialize bits [191:160] +/// of the result. +/// \param __g +/// A single-precision floating-point value used to initialize bits [223:192] +/// of the result. +/// \param __h +/// A single-precision floating-point value used to initialize bits [255:224] +/// of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_setr_ps(float __a, float __b, float __c, float __d, + float __e, float __f, float __g, float __h) +{ + return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); +} + +/// Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 32-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \param __i4 +/// A 32-bit integral value used to initialize bits [159:128] of the result. +/// \param __i5 +/// A 32-bit integral value used to initialize bits [191:160] of the result. +/// \param __i6 +/// A 32-bit integral value used to initialize bits [223:192] of the result. +/// \param __i7 +/// A 32-bit integral value used to initialize bits [255:224] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, + int __i4, int __i5, int __i6, int __i7) +{ + return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); +} + +/// Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 16-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w15 +/// A 16-bit integral value used to initialize bits [15:0] of the result. +/// \param __w14 +/// A 16-bit integral value used to initialize bits [31:16] of the result. +/// \param __w13 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w12 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w11 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w10 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w09 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w08 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \param __w07 +/// A 16-bit integral value used to initialize bits [143:128] of the result. +/// \param __w06 +/// A 16-bit integral value used to initialize bits [159:144] of the result. 
+/// \param __w05 +/// A 16-bit integral value used to initialize bits [175:160] of the result. +/// \param __w04 +/// A 16-bit integral value used to initialize bits [191:176] of the result. +/// \param __w03 +/// A 16-bit integral value used to initialize bits [207:192] of the result. +/// \param __w02 +/// A 16-bit integral value used to initialize bits [223:208] of the result. +/// \param __w01 +/// A 16-bit integral value used to initialize bits [239:224] of the result. +/// \param __w00 +/// A 16-bit integral value used to initialize bits [255:240] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, + short __w11, short __w10, short __w09, short __w08, + short __w07, short __w06, short __w05, short __w04, + short __w03, short __w02, short __w01, short __w00) +{ + return _mm256_set_epi16(__w00, __w01, __w02, __w03, + __w04, __w05, __w06, __w07, + __w08, __w09, __w10, __w11, + __w12, __w13, __w14, __w15); +} + +/// Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 8-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b31 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \param __b30 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b29 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b28 +/// An 8-bit integral value used to initialize bits [31:24] of the result. +/// \param __b27 +/// An 8-bit integral value used to initialize bits [39:32] of the result. +/// \param __b26 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b25 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b24 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b23 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b22 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b21 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b20 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b19 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b18 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b17 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b16 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [135:128] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [143:136] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [151:144] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [159:152] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [167:160] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [175:168] of the result. +/// \param __b09 +/// An 8-bit integral value used to initialize bits [183:176] of the result. 
+/// \param __b08 +/// An 8-bit integral value used to initialize bits [191:184] of the result. +/// \param __b07 +/// An 8-bit integral value used to initialize bits [199:192] of the result. +/// \param __b06 +/// An 8-bit integral value used to initialize bits [207:200] of the result. +/// \param __b05 +/// An 8-bit integral value used to initialize bits [215:208] of the result. +/// \param __b04 +/// An 8-bit integral value used to initialize bits [223:216] of the result. +/// \param __b03 +/// An 8-bit integral value used to initialize bits [231:224] of the result. +/// \param __b02 +/// An 8-bit integral value used to initialize bits [239:232] of the result. +/// \param __b01 +/// An 8-bit integral value used to initialize bits [247:240] of the result. +/// \param __b00 +/// An 8-bit integral value used to initialize bits [255:248] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, + char __b27, char __b26, char __b25, char __b24, + char __b23, char __b22, char __b21, char __b20, + char __b19, char __b18, char __b17, char __b16, + char __b15, char __b14, char __b13, char __b12, + char __b11, char __b10, char __b09, char __b08, + char __b07, char __b06, char __b05, char __b04, + char __b03, char __b02, char __b01, char __b00) +{ + return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, + __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, + __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, + __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); +} + +/// Constructs a 256-bit integer vector, initialized in reverse order +/// with the specified 64-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLQDQ+VINSERTF128 +/// instruction. +/// +/// \param __a +/// A 64-bit integral value used to initialize bits [63:0] of the result. +/// \param __b +/// A 64-bit integral value used to initialize bits [127:64] of the result. +/// \param __c +/// A 64-bit integral value used to initialize bits [191:128] of the result. +/// \param __d +/// A 64-bit integral value used to initialize bits [255:192] of the result. +/// \returns An initialized 256-bit integer vector. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) +{ + return _mm256_set_epi64x(__d, __c, __b, __a); +} + +/* Create vectors with repeated elements */ +/// Constructs a 256-bit floating-point vector of [4 x double], with each +/// of the four double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP+VINSERTF128 instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 256-bit floating-point vector of [4 x double]. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_set1_pd(double __w) +{ + return _mm256_set_pd(__w, __w, __w, __w); +} + +/// Constructs a 256-bit floating-point vector of [8 x float], with each +/// of the eight single-precision floating-point vector elements set to the +/// specified single-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS+VINSERTF128 +/// instruction. 
+/// +/// \param __w +/// A single-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 256-bit floating-point vector of [8 x float]. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_set1_ps(float __w) +{ + return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); +} + +/// Constructs a 256-bit integer vector of [8 x i32], with each of the +/// 32-bit integral vector elements set to the specified 32-bit integral +/// value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS+VINSERTF128 +/// instruction. +/// +/// \param __i +/// A 32-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [8 x i32]. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set1_epi32(int __i) +{ + return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); +} + +/// Constructs a 256-bit integer vector of [16 x i16], with each of the +/// 16-bit integral vector elements set to the specified 16-bit integral +/// value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSHUFB+VINSERTF128 instruction. +/// +/// \param __w +/// A 16-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [16 x i16]. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set1_epi16(short __w) +{ + return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, + __w, __w, __w, __w, __w, __w, __w, __w); +} + +/// Constructs a 256-bit integer vector of [32 x i8], with each of the +/// 8-bit integral vector elements set to the specified 8-bit integral value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSHUFB+VINSERTF128 instruction. +/// +/// \param __b +/// An 8-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [32 x i8]. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set1_epi8(char __b) +{ + return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b, + __b, __b, __b, __b, __b, __b, __b, __b); +} + +/// Constructs a 256-bit integer vector of [4 x i64], with each of the +/// 64-bit integral vector elements set to the specified 64-bit integral +/// value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP+VINSERTF128 instruction. +/// +/// \param __q +/// A 64-bit integral value used to initialize each vector element of the +/// result. +/// \returns An initialized 256-bit integer vector of [4 x i64]. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set1_epi64x(long long __q) +{ + return _mm256_set_epi64x(__q, __q, __q, __q); +} + +/* Create __zeroed vectors */ +/// Constructs a 256-bit floating-point vector of [4 x double] with all +/// vector elements initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS instruction. +/// +/// \returns A 256-bit vector of [4 x double] with all elements set to zero. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_setzero_pd(void) +{ + return __extension__ (__m256d){ 0, 0, 0, 0 }; +} + +/// Constructs a 256-bit floating-point vector of [8 x float] with all +/// vector elements initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS instruction. +/// +/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
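+///
+/// Illustrative usage sketch (an editorial addition, not part of the upstream
+/// header); the buffer name and trip count below are hypothetical:
+///
+/// \code
+/// float data[64] = {0};                        /* hypothetical input buffer */
+/// __m256 acc = _mm256_setzero_ps();
+/// for (int i = 0; i < 64; i += 8)
+///   acc = _mm256_add_ps(acc, _mm256_loadu_ps(&data[i]));
+/// \endcode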
+static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_setzero_ps(void) +{ + return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; +} + +/// Constructs a 256-bit integer vector initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS instruction. +/// +/// \returns A 256-bit integer vector initialized to zero. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setzero_si256(void) +{ + return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; +} + +/* Cast between vector types */ +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// floating-point vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 256-bit floating-point vector of [8 x float] containing the same +/// bitwise pattern as the parameter. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_castpd_ps(__m256d __a) +{ + return (__m256)__a; +} + +/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit +/// integer vector. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 256-bit integer vector containing the same bitwise pattern as the +/// parameter. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_castpd_si256(__m256d __a) +{ + return (__m256i)__a; +} + +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// floating-point vector of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 256-bit floating-point vector of [4 x double] containing the same +/// bitwise pattern as the parameter. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_castps_pd(__m256 __a) +{ + return (__m256d)__a; +} + +/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit +/// integer vector. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 256-bit integer vector containing the same bitwise pattern as the +/// parameter. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_castps_si256(__m256 __a) +{ + return (__m256i)__a; +} + +/// Casts a 256-bit integer vector into a 256-bit floating-point vector +/// of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit floating-point vector of [8 x float] containing the same +/// bitwise pattern as the parameter. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_castsi256_ps(__m256i __a) +{ + return (__m256)__a; +} + +/// Casts a 256-bit integer vector into a 256-bit floating-point vector +/// of [4 x double]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 256-bit floating-point vector of [4 x double] containing the same +/// bitwise pattern as the parameter. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_castsi256_pd(__m256i __a) +{ + return (__m256d)__a; +} + +/// Returns the lower 128 bits of a 256-bit floating-point vector of +/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 
+/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +/// \returns A 128-bit floating-point vector of [2 x double] containing the +/// lower 128 bits of the parameter. +static __inline __m128d __DEFAULT_FN_ATTRS +_mm256_castpd256_pd128(__m256d __a) +{ + return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); +} + +/// Returns the lower 128 bits of a 256-bit floating-point vector of +/// [8 x float] as a 128-bit floating-point vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +/// \returns A 128-bit floating-point vector of [4 x float] containing the +/// lower 128 bits of the parameter. +static __inline __m128 __DEFAULT_FN_ATTRS +_mm256_castps256_ps128(__m256 __a) +{ + return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); +} + +/// Truncates a 256-bit integer vector into a 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 256-bit integer vector. +/// \returns A 128-bit integer vector containing the lower 128 bits of the +/// parameter. +static __inline __m128i __DEFAULT_FN_ATTRS +_mm256_castsi256_si128(__m256i __a) +{ + return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); +} + +/// Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The contents of the upper 128 bits +/// are undefined. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_castpd128_pd256(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); +} + +/// Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The contents of the upper 128 bits +/// are undefined. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_castps128_ps256(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); +} + +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// +/// The lower 128 bits contain the value of the source vector. The contents +/// of the upper 128 bits are undefined. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The contents of the upper 128 bits are undefined. 
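+///
+/// Illustrative sketch (an editorial addition, not part of the upstream
+/// header): because the upper 128 bits are undefined here, code that needs
+/// them zeroed should use _mm256_zextsi128_si256() below instead.
+///
+/// \code
+/// __m128i lo   = _mm_set1_epi32(7);
+/// __m256i wide = _mm256_castsi128_si256(lo);   /* bits [255:128] undefined */
+/// \endcode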
+static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_castsi128_si256(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); +} + +/// Constructs a 256-bit floating-point vector of [4 x double] from a +/// 128-bit floating-point vector of [2 x double]. The lower 128 bits +/// contain the value of the source vector. The upper 128 bits are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_zextpd128_pd256(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); +} + +/// Constructs a 256-bit floating-point vector of [8 x float] from a +/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain +/// the value of the source vector. The upper 128 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits +/// contain the value of the parameter. The upper 128 bits are set to zero. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_zextps128_ps256(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// Constructs a 256-bit integer vector from a 128-bit integer vector. +/// The lower 128 bits contain the value of the source vector. The upper +/// 128 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 256-bit integer vector. The lower 128 bits contain the value of +/// the parameter. The upper 128 bits are set to zero. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_zextsi128_si256(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); +} + +/* + Vector insert. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +/// Constructs a new 256-bit vector of [8 x float] by first duplicating +/// a 256-bit vector of [8 x float] given in the first parameter, and then +/// replacing either the upper or the lower 128 bits with the contents of a +/// 128-bit vector of [4 x float] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. +/// +/// \headerfile +/// +/// \code +/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [8 x float]. This vector is copied to the result +/// first, and then either the upper or the lower 128 bits of the result will +/// be replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit vector of [4 x float]. The contents of this parameter are +/// written to either the upper or the lower 128 bits of the result depending +/// on the value of parameter \a M. +/// \param M +/// An immediate integer. 
The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit vector of [8 x float] containing the interleaved values. +#define _mm256_insertf128_ps(V1, V2, M) \ + ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ + (__v4sf)(__m128)(V2), (int)(M))) + +/// Constructs a new 256-bit vector of [4 x double] by first duplicating +/// a 256-bit vector of [4 x double] given in the first parameter, and then +/// replacing either the upper or the lower 128 bits with the contents of a +/// 128-bit vector of [2 x double] in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. +/// +/// \headerfile +/// +/// \code +/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit vector of [4 x double]. This vector is copied to the result +/// first, and then either the upper or the lower 128 bits of the result will +/// be replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit vector of [2 x double]. The contents of this parameter are +/// written to either the upper or the lower 128 bits of the result depending +/// on the value of parameter \a M. +/// \param M +/// An immediate integer. The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit vector of [4 x double] containing the interleaved values. +#define _mm256_insertf128_pd(V1, V2, M) \ + ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M))) + +/// Constructs a new 256-bit integer vector by first duplicating a +/// 256-bit integer vector given in the first parameter, and then replacing +/// either the upper or the lower 128 bits with the contents of a 128-bit +/// integer vector in the second parameter. +/// +/// The immediate integer parameter determines between the upper or the lower +/// 128 bits. +/// +/// \headerfile +/// +/// \code +/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param V1 +/// A 256-bit integer vector. This vector is copied to the result first, and +/// then either the upper or the lower 128 bits of the result will be +/// replaced by the contents of \a V2. +/// \param V2 +/// A 128-bit integer vector. The contents of this parameter are written to +/// either the upper or the lower 128 bits of the result depending on the +/// value of parameter \a M. +/// \param M +/// An immediate integer. 
The least significant bit determines how the values +/// from the two parameters are interleaved: \n +/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, +/// and bits [255:128] of \a V1 are copied to bits [255:128] of the +/// result. \n +/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the +/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the +/// result. +/// \returns A 256-bit integer vector containing the interleaved values. +#define _mm256_insertf128_si256(V1, V2, M) \ + ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ + (__v4si)(__m128i)(V2), (int)(M))) + +/* + Vector extract. + We use macros rather than inlines because we only want to accept + invocations where the immediate M is a constant expression. +*/ +/// Extracts either the upper or the lower 128 bits from a 256-bit vector +/// of [8 x float], as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm256_extractf128_ps(__m256 V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit vector of [8 x float]. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit vector of [4 x float] containing the extracted bits. +#define _mm256_extractf128_ps(V, M) \ + ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))) + +/// Extracts either the upper or the lower 128 bits from a 256-bit vector +/// of [4 x double], as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm256_extractf128_pd(__m256d V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit vector of [4 x double]. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit vector of [2 x double] containing the extracted bits. +#define _mm256_extractf128_pd(V, M) \ + ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))) + +/// Extracts either the upper or the lower 128 bits from a 256-bit +/// integer vector, as determined by the immediate integer parameter, and +/// returns the extracted bits as a 128-bit integer vector. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm256_extractf128_si256(__m256i V, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction. +/// +/// \param V +/// A 256-bit integer vector. +/// \param M +/// An immediate integer. The least significant bit determines which bits are +/// extracted from the first parameter: \n +/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the +/// result. \n +/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. +/// \returns A 128-bit integer vector containing the extracted bits. 
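+///
+/// Illustrative usage sketch (an editorial addition, not part of the upstream
+/// header); note that \a M must be an integer constant expression:
+///
+/// \code
+/// __m256i v  = _mm256_set1_epi32(42);
+/// __m128i hi = _mm256_extractf128_si256(v, 1); /* bits [255:128] of v */
+/// \endcode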
+#define _mm256_extractf128_si256(V, M) \ + ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))) + +/// Constructs a 256-bit floating-point vector of [8 x float] by +/// concatenating two 128-bit floating-point vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param __hi +/// A 128-bit floating-point vector of [4 x float] to be copied to the upper +/// 128 bits of the result. +/// \param __lo +/// A 128-bit floating-point vector of [4 x float] to be copied to the lower +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_set_m128 (__m128 __hi, __m128 __lo) +{ + return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); +} + +/// Constructs a 256-bit floating-point vector of [4 x double] by +/// concatenating two 128-bit floating-point vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param __hi +/// A 128-bit floating-point vector of [2 x double] to be copied to the upper +/// 128 bits of the result. +/// \param __lo +/// A 128-bit floating-point vector of [2 x double] to be copied to the lower +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_set_m128d (__m128d __hi, __m128d __lo) +{ + return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); +} + +/// Constructs a 256-bit integer vector by concatenating two 128-bit +/// integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param __hi +/// A 128-bit integer vector to be copied to the upper 128 bits of the +/// result. +/// \param __lo +/// A 128-bit integer vector to be copied to the lower 128 bits of the +/// result. +/// \returns A 256-bit integer vector containing the concatenated result. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_set_m128i (__m128i __hi, __m128i __lo) +{ + return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); +} + +/// Constructs a 256-bit floating-point vector of [8 x float] by +/// concatenating two 128-bit floating-point vectors of [4 x float]. This is +/// similar to _mm256_set_m128, but the order of the input parameters is +/// swapped. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param __lo +/// A 128-bit floating-point vector of [4 x float] to be copied to the lower +/// 128 bits of the result. +/// \param __hi +/// A 128-bit floating-point vector of [4 x float] to be copied to the upper +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_setr_m128 (__m128 __lo, __m128 __hi) +{ + return _mm256_set_m128(__hi, __lo); +} + +/// Constructs a 256-bit floating-point vector of [4 x double] by +/// concatenating two 128-bit floating-point vectors of [2 x double]. This is +/// similar to _mm256_set_m128d, but the order of the input parameters is +/// swapped. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. 
+/// +/// \param __lo +/// A 128-bit floating-point vector of [2 x double] to be copied to the lower +/// 128 bits of the result. +/// \param __hi +/// A 128-bit floating-point vector of [2 x double] to be copied to the upper +/// 128 bits of the result. +/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. +static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_setr_m128d (__m128d __lo, __m128d __hi) +{ + return (__m256d)_mm256_set_m128d(__hi, __lo); +} + +/// Constructs a 256-bit integer vector by concatenating two 128-bit +/// integer vectors. This is similar to _mm256_set_m128i, but the order of +/// the input parameters is swapped. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VINSERTF128 instruction. +/// +/// \param __lo +/// A 128-bit integer vector to be copied to the lower 128 bits of the +/// result. +/// \param __hi +/// A 128-bit integer vector to be copied to the upper 128 bits of the +/// result. +/// \returns A 256-bit integer vector containing the concatenated result. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_setr_m128i (__m128i __lo, __m128i __hi) +{ + return (__m256i)_mm256_set_m128i(__hi, __lo); +} + +/* SIMD load ops (unaligned) */ +/// Loads two 128-bit floating-point vectors of [4 x float] from +/// unaligned memory locations and constructs a 256-bit floating-point vector +/// of [8 x float] by concatenating the two 128-bit vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to load instructions followed by the +/// VINSERTF128 instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing 4 consecutive +/// single-precision floating-point values. These values are to be copied to +/// bits[255:128] of the result. The address of the memory location does not +/// have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing 4 consecutive +/// single-precision floating-point values. These values are to be copied to +/// bits[127:0] of the result. The address of the memory location does not +/// have to be aligned. +/// \returns A 256-bit floating-point vector of [8 x float] containing the +/// concatenated result. +static __inline __m256 __DEFAULT_FN_ATTRS +_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) +{ + return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); +} + +/// Loads two 128-bit floating-point vectors of [2 x double] from +/// unaligned memory locations and constructs a 256-bit floating-point vector +/// of [4 x double] by concatenating the two 128-bit vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to load instructions followed by the +/// VINSERTF128 instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing two consecutive +/// double-precision floating-point values. These values are to be copied to +/// bits[255:128] of the result. The address of the memory location does not +/// have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing two consecutive +/// double-precision floating-point values. These values are to be copied to +/// bits[127:0] of the result. The address of the memory location does not +/// have to be aligned. +/// \returns A 256-bit floating-point vector of [4 x double] containing the +/// concatenated result. 
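+///
+/// Illustrative usage sketch (an editorial addition, not part of the upstream
+/// header); the buffer below is hypothetical and need not be aligned:
+///
+/// \code
+/// double buf[4] = {1.0, 2.0, 3.0, 4.0};        /* hypothetical input */
+/// __m256d v = _mm256_loadu2_m128d(&buf[2], &buf[0]); /* hi = buf[2..3], lo = buf[0..1] */
+/// \endcode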
+static __inline __m256d __DEFAULT_FN_ATTRS +_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) +{ + return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); +} + +/// Loads two 128-bit integer vectors from unaligned memory locations and +/// constructs a 256-bit integer vector by concatenating the two 128-bit +/// vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to load instructions followed by the +/// VINSERTF128 instruction. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location containing a 128-bit integer +/// vector. This vector is to be copied to bits[255:128] of the result. The +/// address of the memory location does not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location containing a 128-bit integer +/// vector. This vector is to be copied to bits[127:0] of the result. The +/// address of the memory location does not have to be aligned. +/// \returns A 256-bit integer vector containing the concatenated result. +static __inline __m256i __DEFAULT_FN_ATTRS +_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) +{ + return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo)); +} + +/* SIMD store ops (unaligned) */ +/// Stores the upper and lower 128 bits of a 256-bit floating-point +/// vector of [8 x float] into two different unaligned memory locations. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit floating-point vector of [8 x float]. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) +{ + __m128 __v128; + + __v128 = _mm256_castps256_ps128(__a); + _mm_storeu_ps(__addr_lo, __v128); + __v128 = _mm256_extractf128_ps(__a, 1); + _mm_storeu_ps(__addr_hi, __v128); +} + +/// Stores the upper and lower 128 bits of a 256-bit floating-point +/// vector of [4 x double] into two different unaligned memory locations. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit floating-point vector of [4 x double]. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) +{ + __m128d __v128; + + __v128 = _mm256_castpd256_pd128(__a); + _mm_storeu_pd(__addr_lo, __v128); + __v128 = _mm256_extractf128_pd(__a, 1); + _mm_storeu_pd(__addr_hi, __v128); +} + +/// Stores the upper and lower 128 bits of a 256-bit integer vector into +/// two different unaligned memory locations. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VEXTRACTF128 instruction and the +/// store instructions. +/// +/// \param __addr_hi +/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __addr_lo +/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be +/// copied to this memory location. The address of this memory location does +/// not have to be aligned. +/// \param __a +/// A 256-bit integer vector. +static __inline void __DEFAULT_FN_ATTRS +_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) +{ + __m128i __v128; + + __v128 = _mm256_castsi256_si128(__a); + _mm_storeu_si128(__addr_lo, __v128); + __v128 = _mm256_extractf128_si256(__a, 1); + _mm_storeu_si128(__addr_hi, __v128); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS128 + +#endif /* __AVXINTRIN_H */ diff --git a/include-llvm/avxvnniintrin.h b/include-llvm/avxvnniintrin.h new file mode 100644 index 0000000..ad45cb7 --- /dev/null +++ b/include-llvm/avxvnniintrin.h @@ -0,0 +1,225 @@ +/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------=== + * + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __AVXVNNIINTRIN_H +#define __AVXVNNIINTRIN_H + +/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */ +/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) +/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) +/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) + +/* Intrinsics with _avx_ prefix are for compatibility with msvc. */ +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. 
Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 7 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:256] := 0 +/// \endoperation +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) +{ + return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSD instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with +/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed +/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer +/// in \a __S using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPBUSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) +/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) +/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) +/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, +/// and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSD instructions. 
+/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with +/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit +/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S +/// using signed saturation, and store the packed 32-bit results in DST. +/// +/// This intrinsic corresponds to the VPDPWSSDS instructions. +/// +/// \operation +/// FOR j := 0 to 3 +/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) +/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) +/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) +/// ENDFOR +/// DST[MAX:128] := 0 +/// \endoperation +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif // __AVXVNNIINTRIN_H diff --git a/include-llvm/bmi2intrin.h b/include-llvm/bmi2intrin.h new file mode 100644 index 0000000..0b56aed --- /dev/null +++ b/include-llvm/bmi2intrin.h @@ -0,0 +1,81 @@ +/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __BMI2INTRIN_H +#define __BMI2INTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_bzhi_u32(unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bzhi_si(__X, __Y); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_pdep_u32(unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pdep_si(__X, __Y); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_pext_u32(unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_pext_si(__X, __Y); +} + +#ifdef __x86_64__ + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_bzhi_u64(unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bzhi_di(__X, __Y); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_pdep_u64(unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pdep_di(__X, __Y); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_pext_u64(unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_pext_di(__X, __Y); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mulx_u64 (unsigned long long __X, unsigned long long __Y, + unsigned long long *__P) +{ + unsigned __int128 __res = (unsigned __int128) __X * __Y; + *__P = (unsigned long long) (__res >> 64); + return (unsigned long long) __res; +} + +#else /* !__x86_64__ */ + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P) +{ + unsigned long long __res = (unsigned long long) __X * __Y; + *__P = (unsigned int) (__res >> 32); + return (unsigned int) __res; +} + +#endif /* !__x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __BMI2INTRIN_H */ diff --git a/include-llvm/bmiintrin.h b/include-llvm/bmiintrin.h new file mode 100644 index 0000000..f583c21 --- /dev/null +++ b/include-llvm/bmiintrin.h @@ -0,0 +1,427 @@ +/*===---- bmiintrin.h - BMI intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __BMIINTRIN_H +#define __BMIINTRIN_H + +/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT + instruction behaves as BSF on non-BMI targets, there is code that expects + to use it as a potentially faster version of BSF. */ +#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) + +#define _tzcnt_u16(a) (__tzcnt_u16((a))) + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TZCNT instruction. +/// +/// \param __X +/// An unsigned 16-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 16-bit integer containing the number of trailing zero +/// bits in the operand. +static __inline__ unsigned short __RELAXED_FN_ATTRS +__tzcnt_u16(unsigned short __X) +{ + return __builtin_ia32_tzcnt_u16(__X); +} + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be counted. 
+/// \returns An unsigned 32-bit integer containing the number of trailing zero +/// bits in the operand. +static __inline__ unsigned int __RELAXED_FN_ATTRS +__tzcnt_u32(unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32(__X); +} + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be counted. +/// \returns An 32-bit integer containing the number of trailing zero bits in +/// the operand. +static __inline__ int __RELAXED_FN_ATTRS +_mm_tzcnt_32(unsigned int __X) +{ + return __builtin_ia32_tzcnt_u32(__X); +} + +#define _tzcnt_u32(a) (__tzcnt_u32((a))) + +#ifdef __x86_64__ + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of trailing zero +/// bits in the operand. +static __inline__ unsigned long long __RELAXED_FN_ATTRS +__tzcnt_u64(unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64(__X); +} + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be counted. +/// \returns An 64-bit integer containing the number of trailing zero bits in +/// the operand. +static __inline__ long long __RELAXED_FN_ATTRS +_mm_tzcnt_64(unsigned long long __X) +{ + return __builtin_ia32_tzcnt_u64(__X); +} + +#define _tzcnt_u64(a) (__tzcnt_u64((a))) + +#endif /* __x86_64__ */ + +#undef __RELAXED_FN_ATTRS + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__BMI__) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) + +#define _andn_u32(a, b) (__andn_u32((a), (b))) + +/* _bextr_u32 != __bextr_u32 */ +#define _blsi_u32(a) (__blsi_u32((a))) + +#define _blsmsk_u32(a) (__blsmsk_u32((a))) + +#define _blsr_u32(a) (__blsr_u32((a))) + +/// Performs a bitwise AND of the second operand with the one's +/// complement of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ANDN instruction. +/// +/// \param __X +/// An unsigned integer containing one of the operands. +/// \param __Y +/// An unsigned integer containing one of the operands. +/// \returns An unsigned integer containing the bitwise AND of the second +/// operand with the one's complement of the first operand. +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__andn_u32(unsigned int __X, unsigned int __Y) +{ + return ~__X & __Y; +} + +/* AMD-specified, double-leading-underscore version of BEXTR */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify which bits are extracted. Bits [7:0] +/// specify the index of the least significant bit. Bits [15:8] specify the +/// number of bits to be extracted. +/// \returns An unsigned integer whose least significant bits contain the +/// extracted bits. 
+/// \see _bextr_u32 +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__bextr_u32(unsigned int __X, unsigned int __Y) +{ + return __builtin_ia32_bextr_u32(__X, __Y); +} + +/* Intel-specified, single-leading-underscore version of BEXTR */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify the index of the least significant +/// bit for the bits to be extracted. Bits [7:0] specify the index. +/// \param __Z +/// An unsigned integer used to specify the number of bits to be extracted. +/// Bits [7:0] specify the number of bits. +/// \returns An unsigned integer whose least significant bits contain the +/// extracted bits. +/// \see __bextr_u32 +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) +{ + return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +/* Intel-specified, single-leading-underscore version of BEXTR2 */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify which bits are extracted. Bits [7:0] +/// specify the index of the least significant bit. Bits [15:8] specify the +/// number of bits to be extracted. +/// \returns An unsigned integer whose least significant bits contain the +/// extracted bits. +/// \see __bextr_u32 +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_bextr2_u32(unsigned int __X, unsigned int __Y) { + return __builtin_ia32_bextr_u32(__X, __Y); +} + +/// Clears all bits in the source except for the least significant bit +/// containing a value of 1 and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSI instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be cleared. +/// \returns An unsigned integer containing the result of clearing the bits from +/// the source operand. +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blsi_u32(unsigned int __X) +{ + return __X & -__X; +} + +/// Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSMSK instruction. +/// +/// \param __X +/// An unsigned integer used to create the mask. +/// \returns An unsigned integer containing the newly created mask. +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blsmsk_u32(unsigned int __X) +{ + return __X ^ (__X - 1); +} + +/// Clears the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSR instruction. +/// +/// \param __X +/// An unsigned integer containing the operand to be cleared. +/// \returns An unsigned integer containing the result of clearing the source +/// operand. 
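As a quick illustration of how the BLSI/BLSMSK/BLSR/BEXTR helpers above compose, here is a minimal sketch; it is not part of the patch, and it assumes the header is reached through an umbrella header such as <x86intrin.h> and a clang build with -mbmi:

#include <stdio.h>
#include <x86intrin.h>

int main(void)
{
    unsigned int x = 0xb8u;                     /* 1011 1000 */
    unsigned int lowbit = __blsi_u32(x);        /* 0x08: isolate the lowest set bit */
    unsigned int mask   = __blsmsk_u32(x);      /* 0x0f: ones up to the lowest set bit */
    unsigned int rest   = __blsr_u32(x);        /* 0xb0: clear the lowest set bit */
    unsigned int field  = _bextr_u32(x, 3, 4);  /* 0x07: extract 4 bits starting at bit 3 */
    printf("%#x %#x %#x %#x\n", lowbit, mask, rest, field);
    return 0;
}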
+static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blsr_u32(unsigned int __X) +{ + return __X & (__X - 1); +} + +#ifdef __x86_64__ + +#define _andn_u64(a, b) (__andn_u64((a), (b))) + +/* _bextr_u64 != __bextr_u64 */ +#define _blsi_u64(a) (__blsi_u64((a))) + +#define _blsmsk_u64(a) (__blsmsk_u64((a))) + +#define _blsr_u64(a) (__blsr_u64((a))) + +/// Performs a bitwise AND of the second operand with the one's +/// complement of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ANDN instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing one of the operands. +/// \param __Y +/// An unsigned 64-bit integer containing one of the operands. +/// \returns An unsigned 64-bit integer containing the bitwise AND of the second +/// operand with the one's complement of the first operand. +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__andn_u64 (unsigned long long __X, unsigned long long __Y) +{ + return ~__X & __Y; +} + +/* AMD-specified, double-leading-underscore version of BEXTR */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be extracted. +/// \param __Y +/// An unsigned 64-bit integer used to specify which bits are extracted. Bits +/// [7:0] specify the index of the least significant bit. Bits [15:8] specify +/// the number of bits to be extracted. +/// \returns An unsigned 64-bit integer whose least significant bits contain the +/// extracted bits. +/// \see _bextr_u64 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__bextr_u64(unsigned long long __X, unsigned long long __Y) +{ + return __builtin_ia32_bextr_u64(__X, __Y); +} + +/* Intel-specified, single-leading-underscore version of BEXTR */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be extracted. +/// \param __Y +/// An unsigned integer used to specify the index of the least significant +/// bit for the bits to be extracted. Bits [7:0] specify the index. +/// \param __Z +/// An unsigned integer used to specify the number of bits to be extracted. +/// Bits [7:0] specify the number of bits. +/// \returns An unsigned 64-bit integer whose least significant bits contain the +/// extracted bits. +/// \see __bextr_u64 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) +{ + return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); +} + +/* Intel-specified, single-leading-underscore version of BEXTR2 */ +/// Extracts the specified bits from the first operand and returns them +/// in the least significant bits of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BEXTR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be extracted. +/// \param __Y +/// An unsigned 64-bit integer used to specify which bits are extracted. Bits +/// [7:0] specify the index of the least significant bit. Bits [15:8] specify +/// the number of bits to be extracted. +/// \returns An unsigned 64-bit integer whose least significant bits contain the +/// extracted bits. 
+/// \see __bextr_u64 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_bextr2_u64(unsigned long long __X, unsigned long long __Y) { + return __builtin_ia32_bextr_u64(__X, __Y); +} + +/// Clears all bits in the source except for the least significant bit +/// containing a value of 1 and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSI instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// bits from the source operand. +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blsi_u64(unsigned long long __X) +{ + return __X & -__X; +} + +/// Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSMSK instruction. +/// +/// \param __X +/// An unsigned 64-bit integer used to create the mask. +/// \returns An unsigned 64-bit integer containing the newly created mask. +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blsmsk_u64(unsigned long long __X) +{ + return __X ^ (__X - 1); +} + +/// Clears the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the BLSR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing the operand to be cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// source operand. +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blsr_u64(unsigned long long __X) +{ + return __X & (__X - 1); +} + +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__BMI__) */ + +#endif /* __BMIINTRIN_H */ diff --git a/include-llvm/cetintrin.h b/include-llvm/cetintrin.h new file mode 100644 index 0000000..019cab0 --- /dev/null +++ b/include-llvm/cetintrin.h @@ -0,0 +1,109 @@ +/*===---- cetintrin.h - CET intrinsic --------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CETINTRIN_H +#define __CETINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("shstk"))) + +static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) { + __builtin_ia32_incsspd(__a); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) { + __builtin_ia32_incsspq(__a); +} +#endif /* __x86_64__ */ + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { + __builtin_ia32_incsspq(__a); +} +#else /* __x86_64__ */ +static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { + __builtin_ia32_incsspd((int)__a); +} +#endif /* __x86_64__ */ + +static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) { + return __builtin_ia32_rdsspd(__a); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() { + unsigned int t; + return __builtin_ia32_rdsspd(t); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) { + return __builtin_ia32_rdsspq(__a); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() { + unsigned long long t; + return __builtin_ia32_rdsspq(t); +} +#endif /* __x86_64__ */ + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) { + return __builtin_ia32_rdsspq(0); +} +#else /* __x86_64__ */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) { + return __builtin_ia32_rdsspd(0); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() { + __builtin_ia32_saveprevssp(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) { + __builtin_ia32_rstorssp(__p); +} + +static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) { + __builtin_ia32_wrssd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) { + __builtin_ia32_wrssq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) { + __builtin_ia32_wrussd(__a, __p); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) { + __builtin_ia32_wrussq(__a, __p); +} +#endif /* __x86_64__ */ + +static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() { + __builtin_ia32_setssbsy(); +} + +static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) { + __builtin_ia32_clrssbsy(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __CETINTRIN_H */ diff --git a/include-llvm/cldemoteintrin.h b/include-llvm/cldemoteintrin.h new file mode 100644 index 0000000..cfb951c --- /dev/null +++ b/include-llvm/cldemoteintrin.h @@ -0,0 +1,36 @@ +/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CLDEMOTEINTRIN_H +#define __CLDEMOTEINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("cldemote"))) + +/// Hint to hardware that the cache line that contains \p __P should be demoted +/// from the cache closest to the processor core to a level more distant from +/// the processor core. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLDEMOTE instruction. +static __inline__ void __DEFAULT_FN_ATTRS +_cldemote(const void * __P) { + __builtin_ia32_cldemote(__P); +} + +#define _mm_cldemote(p) _cldemote(p) +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/clflushoptintrin.h b/include-llvm/clflushoptintrin.h new file mode 100644 index 0000000..060eb36 --- /dev/null +++ b/include-llvm/clflushoptintrin.h @@ -0,0 +1,27 @@ +/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CLFLUSHOPTINTRIN_H +#define __CLFLUSHOPTINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clflushopt(void const * __m) { + __builtin_ia32_clflushopt(__m); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/clwbintrin.h b/include-llvm/clwbintrin.h new file mode 100644 index 0000000..3360d20 --- /dev/null +++ b/include-llvm/clwbintrin.h @@ -0,0 +1,38 @@ +/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CLWBINTRIN_H +#define __CLWBINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) + +/// Writes back to memory the cache line (if modified) that contains the +/// linear address specified in \a __p from any level of the cache hierarchy in +/// the cache coherence domain +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLWB instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// written back. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clwb(void const *__p) { + __builtin_ia32_clwb(__p); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/clzerointrin.h b/include-llvm/clzerointrin.h new file mode 100644 index 0000000..a180984 --- /dev/null +++ b/include-llvm/clzerointrin.h @@ -0,0 +1,36 @@ +/*===----------------------- clzerointrin.h - CLZERO ----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __CLZEROINTRIN_H +#define __CLZEROINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("clzero"))) + +/// Loads the cache line address and zero's out the cacheline +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLZERO instruction. +/// +/// \param __line +/// A pointer to a cacheline which needs to be zeroed out. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_clzero (void * __line) +{ + __builtin_ia32_clzero ((void *)__line); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __CLZEROINTRIN_H */ diff --git a/include-llvm/crc32intrin.h b/include-llvm/crc32intrin.h new file mode 100644 index 0000000..a0bd99d --- /dev/null +++ b/include-llvm/crc32intrin.h @@ -0,0 +1,100 @@ +/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CRC32INTRIN_H +#define __CRC32INTRIN_H + +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("crc32"))) + +/// Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned char operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CRC32B instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_crc32_u8(unsigned int __C, unsigned char __D) +{ + return __builtin_ia32_crc32qi(__C, __D); +} + +/// Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned short operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CRC32W instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_crc32_u16(unsigned int __C, unsigned short __D) +{ + return __builtin_ia32_crc32hi(__C, __D); +} + +/// Adds the first unsigned integer operand to the CRC-32C checksum of +/// the second unsigned integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CRC32L instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. 
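To show how these accumulator-style CRC intrinsics are typically chained, here is a small sketch of a CRC-32C routine over a byte buffer; it is illustrative rather than part of the header, and it assumes inclusion via <immintrin.h> and a clang build with -mcrc32 or -msse4.2:

#include <stddef.h>
#include <stdint.h>
#include <immintrin.h>

/* CRC-32C of a buffer, folded in one byte at a time via _mm_crc32_u8. */
static uint32_t crc32c(const void *buf, size_t len)
{
    const unsigned char *p = (const unsigned char *)buf;
    uint32_t crc = 0xFFFFFFFFu;
    for (size_t i = 0; i < len; i++)
        crc = _mm_crc32_u8(crc, p[i]);
    return crc ^ 0xFFFFFFFFu;
}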
+static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_crc32_u32(unsigned int __C, unsigned int __D) +{ + return __builtin_ia32_crc32si(__C, __D); +} + +#ifdef __x86_64__ +/// Adds the unsigned integer operand to the CRC-32C checksum of the +/// unsigned 64-bit integer operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CRC32Q instruction. +/// +/// \param __C +/// An unsigned integer operand to add to the CRC-32C checksum of operand +/// \a __D. +/// \param __D +/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. +/// \returns The result of adding operand \a __C to the CRC-32C checksum of +/// operand \a __D. +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_mm_crc32_u64(unsigned long long __C, unsigned long long __D) +{ + return __builtin_ia32_crc32di(__C, __D); +} +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __CRC32INTRIN_H */ diff --git a/include-llvm/emmintrin.h b/include-llvm/emmintrin.h new file mode 100644 index 0000000..e00968e --- /dev/null +++ b/include-llvm/emmintrin.h @@ -0,0 +1,5045 @@ +/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __EMMINTRIN_H +#define __EMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); +typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); + +typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); +typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); + +/* Type defines. */ +typedef double __v2df __attribute__ ((__vector_size__ (16))); +typedef long long __v2di __attribute__ ((__vector_size__ (16))); +typedef short __v8hi __attribute__((__vector_size__(16))); +typedef char __v16qi __attribute__((__vector_size__(16))); + +/* Unsigned types */ +typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); +typedef unsigned short __v8hu __attribute__((__vector_size__(16))); +typedef unsigned char __v16qu __attribute__((__vector_size__(16))); + +/* We need an explicitly signed variant for char. Note that this shouldn't + * appear in the interface though. */ +typedef signed char __v16qs __attribute__((__vector_size__(16))); + +#if (__clang_major__ > 15) +#ifdef __SSE2__ +/* Both _Float16 and __bf16 require SSE2 being enabled. */ +typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); +typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); + +typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); +typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); +#endif +#endif + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) + +/// Adds lower double-precision values in both operands and returns the +/// sum in the lower 64 bits of the result. The upper 64 bits of the result +/// are copied from the upper double-precision value of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSD / ADDSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// sum of the lower 64 bits of both operands. The upper 64 bits are copied +/// from the upper 64 bits of the first source operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_add_sd(__m128d __a, __m128d __b) +{ + __a[0] += __b[0]; + return __a; +} + +/// Adds two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDPD / ADDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the sums of both +/// operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_add_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2df)__a + (__v2df)__b); +} + +/// Subtracts the lower double-precision value of the second operand +/// from the lower double-precision value of the first operand and returns +/// the difference in the lower 64 bits of the result. The upper 64 bits of +/// the result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBSD / SUBSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// difference of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_sub_sd(__m128d __a, __m128d __b) +{ + __a[0] -= __b[0]; + return __a; +} + +/// Subtracts two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBPD / SUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the minuend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the subtrahend. +/// \returns A 128-bit vector of [2 x double] containing the differences between +/// both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_sub_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2df)__a - (__v2df)__b); +} + +/// Multiplies lower double-precision values in both operands and returns +/// the product in the lower 64 bits of the result. The upper 64 bits of the +/// result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULSD / MULSD instruction. 
+/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// product of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mul_sd(__m128d __a, __m128d __b) +{ + __a[0] *= __b[0]; + return __a; +} + +/// Multiplies two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULPD / MULPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the products of both +/// operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_mul_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2df)__a * (__v2df)__b); +} + +/// Divides the lower double-precision value of the first operand by the +/// lower double-precision value of the second operand and returns the +/// quotient in the lower 64 bits of the result. The upper 64 bits of the +/// result are copied from the upper double-precision value of the first +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVSD / DIVSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing divisor. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// quotient of the lower 64 bits of both operands. The upper 64 bits are +/// copied from the upper 64 bits of the first source operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_div_sd(__m128d __a, __m128d __b) +{ + __a[0] /= __b[0]; + return __a; +} + +/// Performs an element-by-element division of two 128-bit vectors of +/// [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVPD / DIVPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the dividend. +/// \param __b +/// A 128-bit vector of [2 x double] containing the divisor. +/// \returns A 128-bit vector of [2 x double] containing the quotients of both +/// operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_div_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2df)__a / (__v2df)__b); +} + +/// Calculates the square root of the lower double-precision value of +/// the second operand and returns it in the lower 64 bits of the result. +/// The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTSD / SQRTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// upper 64 bits of this operand are copied to the upper 64 bits of the +/// result. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// square root is calculated using the lower 64 bits of this operand. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// square root of the lower 64 bits of operand \a __b, and whose upper 64 +/// bits are copied from the upper 64 bits of operand \a __a. 
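The packed (_pd) and scalar (_sd) forms above differ only in which lanes participate. A small sketch, not part of the header, makes the lane behaviour concrete; it uses _mm_set_pd and _mm_storeu_pd from elsewhere in emmintrin.h and assumes a clang build with -msse2:

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    __m128d a = _mm_set_pd(3.0, 1.0);   /* hi = 3.0, lo = 1.0 */
    __m128d b = _mm_set_pd(4.0, 2.0);   /* hi = 4.0, lo = 2.0 */
    double out[2];

    _mm_storeu_pd(out, _mm_add_pd(a, b));             /* both lanes added */
    printf("add_pd: lo=%g hi=%g\n", out[0], out[1]);  /* lo=3 hi=7 */

    _mm_storeu_pd(out, _mm_add_sd(a, b));             /* only the low lane added */
    printf("add_sd: lo=%g hi=%g\n", out[0], out[1]);  /* lo=3 hi=3 (hi copied from a) */
    return 0;
}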
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_sqrt_sd(__m128d __a, __m128d __b) +{ + __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); + return __extension__ (__m128d) { __c[0], __a[1] }; +} + +/// Calculates the square root of the each of two values stored in a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTPD / SQRTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [2 x double] containing the square roots of the +/// values in the operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_sqrt_pd(__m128d __a) +{ + return __builtin_ia32_sqrtpd((__v2df)__a); +} + +/// Compares lower 64-bit double-precision values of both operands, and +/// returns the lesser of the pair of values in the lower 64-bits of the +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINSD / MINSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// minimum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_min_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); +} + +/// Performs element-by-element comparison of the two 128-bit vectors of +/// [2 x double] and returns the vector containing the lesser of each pair of +/// values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINPD / MINPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the minimum values +/// between both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_min_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); +} + +/// Compares lower 64-bit double-precision values of both operands, and +/// returns the greater of the pair of values in the lower 64-bits of the +/// result. The upper 64 bits of the result are copied from the upper +/// double-precision value of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXSD / MAXSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. The +/// lower 64 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// maximum value between both operands. The upper 64 bits are copied from +/// the upper 64 bits of the first source operand. 
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_max_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); +} + +/// Performs element-by-element comparison of the two 128-bit vectors of +/// [2 x double] and returns the vector containing the greater of each pair +/// of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXPD / MAXPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the operands. +/// \returns A 128-bit vector of [2 x double] containing the maximum values +/// between both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_max_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); +} + +/// Performs a bitwise AND of two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPAND / PAND instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values between both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_and_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2du)__a & (__v2du)__b); +} + +/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using +/// the one's complement of the values contained in the first source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPANDN / PANDN instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the left source operand. The +/// one's complement of this value is used in the bitwise AND. +/// \param __b +/// A 128-bit vector of [2 x double] containing the right source operand. +/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the +/// values in the second operand and the one's complement of the first +/// operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_andnot_pd(__m128d __a, __m128d __b) +{ + return (__m128d)(~(__v2du)__a & (__v2du)__b); +} + +/// Performs a bitwise OR of two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPOR / POR instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the +/// values between both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_or_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2du)__a | (__v2du)__b); +} + +/// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPXOR / PXOR instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the +/// values between both operands. 
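These bitwise forms are most often used for sign-bit manipulation. A minimal sketch, not part of the header and using _mm_set1_pd from elsewhere in emmintrin.h, computes a lane-wise absolute value with _mm_andnot_pd:

#include <emmintrin.h>

/* Clear the sign bit of both lanes: |v| = v & ~(-0.0). */
static __m128d abs_pd(__m128d v)
{
    const __m128d sign = _mm_set1_pd(-0.0);   /* only the sign bit set in each lane */
    return _mm_andnot_pd(sign, v);            /* (~sign) & v */
}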
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_xor_pd(__m128d __a, __m128d __b) +{ + return (__m128d)((__v2du)__a ^ (__v2du)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 +/// for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPEQPD / CMPEQPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpeq_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are less than those in the second operand. Each comparison +/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTPD / CMPLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmplt_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are less than or equal to those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLEPD / CMPLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmple_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are greater than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTPD / CMPLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpgt_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are greater than or equal to those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLEPD / CMPLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. 
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpge_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are ordered with respect to those in the second operand. +/// +/// A pair of double-precision values are "ordered" with respect to each +/// other if neither value is a NaN. Each comparison yields 0x0 for false, +/// 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPORDPD / CMPORDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpord_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are unordered with respect to those in the second operand. +/// +/// A pair of double-precision values are "unordered" with respect to each +/// other if one or both values are NaN. Each comparison yields 0x0 for +/// false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPUNORDPD / CMPUNORDPD +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpunord_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are unequal to those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNEQPD / CMPNEQPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpneq_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not less than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTPD / CMPNLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. 
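Since each packed comparison above yields an all-ones or all-zeros lane rather than a boolean, the result is usually reduced with a mask-extraction helper. A short sketch, not part of the header and relying on _mm_movemask_pd defined later in emmintrin.h, counts the equal lanes:

#include <emmintrin.h>

/* Number of lanes (0..2) in which a and b compare equal. */
static int equal_lanes(__m128d a, __m128d b)
{
    __m128d eq = _mm_cmpeq_pd(a, b);     /* all-ones per equal lane */
    int bits = _mm_movemask_pd(eq);      /* one bit per lane, taken from the sign bits */
    return (bits & 1) + ((bits >> 1) & 1);
}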
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnlt_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not less than or equal to those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLEPD / CMPNLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnle_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not greater than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTPD / CMPNLTPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpngt_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); +} + +/// Compares each of the corresponding double-precision values of the +/// 128-bit vectors of [2 x double] to determine if the values in the first +/// operand are not greater than or equal to those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLEPD / CMPNLEPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \param __b +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector containing the comparison results. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnge_pd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPEQSD / CMPEQSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpeq_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTSD / CMPLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmplt_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLESD / CMPLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmple_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTSD / CMPLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpgt_sd(__m128d __a, __m128d __b) +{ + __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); + return __extension__ (__m128d) { __c[0], __a[1] }; +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLESD / CMPLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpge_sd(__m128d __a, __m128d __b) +{ + __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); + return __extension__ (__m128d) { __c[0], __a[1] }; +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is "ordered" with respect to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair +/// of double-precision values are "ordered" with respect to each other if +/// neither value is a NaN. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPORDSD / CMPORDSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpord_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is "unordered" with respect to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair +/// of double-precision values are "unordered" with respect to each other if +/// one or both values are NaN. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPUNORDSD / CMPUNORDSD +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
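/* Editor's note (illustrative sketch, not part of the upstream header): the
 * *_sd comparisons only compare the low lane and pass the upper lane of the
 * first operand through, and the "ord"/"unord" forms are the NaN tests:
 *
 *   __m128d a = _mm_set_pd(7.0, 1.0);            // lanes: { 1.0, 7.0 }
 *   __m128d n = _mm_set_sd(__builtin_nan(""));   // lanes: { NaN, 0.0 }
 *   __m128d r = _mm_cmpord_sd(a, n);             // { 0x0, 7.0 }: a NaN makes the pair unordered
 *   __m128d s = _mm_cmpunord_sd(a, n);           // { all-ones, 7.0 }
 *
 * _mm_cmpgt_sd and _mm_cmpge_sd above are built from the reversed-operand
 * LT/LE compare, after which the upper lane of __a is re-inserted; that is
 * why their bodies rebuild the vector as { __c[0], __a[1] }. */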
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpunord_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNEQSD / CMPNEQSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpneq_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not less than the corresponding +/// value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTSD / CMPNLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnlt_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not less than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLESD / CMPNLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnle_sd(__m128d __a, __m128d __b) +{ + return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not greater than the corresponding +/// value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTSD / CMPNLTSD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpngt_sd(__m128d __a, __m128d __b) +{ + __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); + return __extension__ (__m128d) { __c[0], __a[1] }; +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is not greater than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLESD / CMPNLESD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns A 128-bit vector. The lower 64 bits contains the comparison +/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cmpnge_sd(__m128d __a, __m128d __b) +{ + __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); + return __extension__ (__m128d) { __c[0], __a[1] }; +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. 
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_comieq_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comilt_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comile_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. 
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_comigt_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comige_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 1 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISD / COMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 1 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comineq_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] for equality. The +/// comparison yields 0 for false, 1 for true. +/// +/// If either of the two lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. 
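/* Editor's note (illustrative sketch, not part of the upstream header):
 * unlike the _mm_cmp*_sd mask forms above, the _mm_comi*_sd predicates (and
 * the _mm_ucomi*_sd variants below) return a plain int:
 *
 *   __m128d x = _mm_set_sd(1.0);
 *   __m128d y = _mm_set_sd(2.0);
 *   int lt    = _mm_comilt_sd(x, y);   // 1: 1.0 < 2.0
 *   int eq    = _mm_comieq_sd(x, y);   // 0
 *
 * With a NaN operand every predicate except the "neq" form reports 0, and
 * "neq" reports 1, as the per-function documentation notes. The COMISD and
 * UCOMISD encodings differ only in how they signal floating-point exceptions
 * on quiet NaNs. */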
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomieq_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomilt_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is less than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomile_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than the corresponding value +/// in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. 
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomigt_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is greater than or equal to the +/// corresponding value in the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two +/// lower double-precision values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison results. If either of the two +/// lower double-precision values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomige_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); +} + +/// Compares the lower double-precision floating-point values in each of +/// the two 128-bit floating-point vectors of [2 x double] to determine if +/// the value in the first parameter is unequal to the corresponding value in +/// the second parameter. +/// +/// The comparison yields 0 for false, 1 for true. If either of the two lower +/// double-precision values is NaN, 1 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __b. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision value is +/// compared to the lower double-precision value of \a __a. +/// \returns An integer containing the comparison result. If either of the two +/// lower double-precision values is NaN, 1 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomineq_sd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); +} + +/// Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two single-precision floating-point +/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. +/// The upper 64 bits of the result vector are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPD2PS / CVTPD2PS instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvtpd_ps(__m128d __a) +{ + return __builtin_ia32_cvtpd2ps((__v2df)__a); +} + +/// Converts the lower two single-precision floating-point elements of a +/// 128-bit vector of [4 x float] into two double-precision floating-point +/// values, returned in a 128-bit vector of [2 x double]. The upper two +/// elements of the input vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPS2PD / CVTPS2PD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. 
The lower two single-precision +/// floating-point elements are converted to double-precision values. The +/// upper two elements are unused. +/// \returns A 128-bit vector of [2 x double] containing the converted values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cvtps_pd(__m128 __a) +{ + return (__m128d) __builtin_convertvector( + __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); +} + +/// Converts the lower two integer elements of a 128-bit vector of +/// [4 x i32] into two double-precision floating-point values, returned in a +/// 128-bit vector of [2 x double]. +/// +/// The upper two elements of the input vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTDQ2PD / CVTDQ2PD instruction. +/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are +/// converted to double-precision values. +/// +/// The upper two elements are unused. +/// \returns A 128-bit vector of [2 x double] containing the converted values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cvtepi32_pd(__m128i __a) +{ + return (__m128d) __builtin_convertvector( + __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); +} + +/// Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper +/// 64 bits of the result vector are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPD2DQ / CVTPD2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtpd_epi32(__m128d __a) +{ + return __builtin_ia32_cvtpd2dq((__v2df)__a); +} + +/// Converts the low-order element of a 128-bit vector of [2 x double] +/// into a 32-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSD2SI / CVTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the +/// conversion. +/// \returns A 32-bit signed integer containing the converted value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvtsd_si32(__m128d __a) +{ + return __builtin_ia32_cvtsd2si((__v2df)__a); +} + +/// Converts the lower double-precision floating-point element of a +/// 128-bit vector of [2 x double], in the second parameter, into a +/// single-precision floating-point value, returned in the lower 32 bits of a +/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are +/// copied from the upper 96 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSD2SS / CVTSD2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are +/// copied to the upper 96 bits of the result. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower double-precision +/// floating-point element is used in the conversion. +/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the +/// converted value from the second parameter. The upper 96 bits are copied +/// from the upper 96 bits of the first parameter. 
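/* Editor's note (illustrative sketch, not part of the upstream header): the
 * packed conversions narrow or widen between the two halves of a register:
 *
 *   __m128d d  = _mm_set_pd(2.5, 1.5);   // lanes: { 1.5, 2.5 }
 *   __m128  f  = _mm_cvtpd_ps(d);        // { 1.5f, 2.5f, 0.0f, 0.0f }
 *   __m128d d2 = _mm_cvtps_pd(f);        // { 1.5, 2.5 } again
 *   __m128i i  = _mm_cvtpd_epi32(d);     // { 2, 2, 0, 0 }: both values round to the nearest even integer
 *   int     s  = _mm_cvtsd_si32(d);      // 2 (1.5 rounds to even under the default mode)
 *
 * Results that depend on rounding follow the current MXCSR rounding mode
 * (round-to-nearest-even by default); the "tt" variants further below always
 * truncate instead. */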
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvtsd_ss(__m128 __a, __m128d __b) +{ + return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); +} + +/// Converts a 32-bit signed integer value, in the second parameter, into +/// a double-precision floating-point value, returned in the lower 64 bits of +/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector +/// are copied from the upper 64 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSI2SD / CVTSI2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are +/// copied to the upper 64 bits of the result. +/// \param __b +/// A 32-bit signed integer containing the value to be converted. +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the +/// converted value from the second parameter. The upper 64 bits are copied +/// from the upper 64 bits of the first parameter. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cvtsi32_sd(__m128d __a, int __b) +{ + __a[0] = __b; + return __a; +} + +/// Converts the lower single-precision floating-point element of a +/// 128-bit vector of [4 x float], in the second parameter, into a +/// double-precision floating-point value, returned in the lower 64 bits of +/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector +/// are copied from the upper 64 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSS2SD / CVTSS2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are +/// copied to the upper 64 bits of the result. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower single-precision +/// floating-point element is used in the conversion. +/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the +/// converted value from the second parameter. The upper 64 bits are copied +/// from the upper 64 bits of the first parameter. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cvtss_sd(__m128d __a, __m128 __b) +{ + __a[0] = __b[0]; + return __a; +} + +/// Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. +/// +/// If the result of either conversion is inexact, the result is truncated +/// (rounded towards zero) regardless of the current MXCSR setting. The upper +/// 64 bits of the result vector are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTPD2DQ / CVTTPD2DQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the +/// converted values. The upper 64 bits are set to zero. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvttpd_epi32(__m128d __a) +{ + return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); +} + +/// Converts the low-order element of a [2 x double] vector into a 32-bit +/// signed integer value, truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTSD2SI / CVTTSD2SI +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the +/// conversion. +/// \returns A 32-bit signed integer containing the converted value. 
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvttsd_si32(__m128d __a) +{ + return __builtin_ia32_cvttsd2si((__v2df)__a); +} + +/// Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in a 64-bit vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPD2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpd_pi32(__m128d __a) +{ + return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); +} + +/// Converts the two double-precision floating-point elements of a +/// 128-bit vector of [2 x double] into two signed 32-bit integer values, +/// returned in a 64-bit vector of [2 x i32]. +/// +/// If the result of either conversion is inexact, the result is truncated +/// (rounded towards zero) regardless of the current MXCSR setting. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTTPD2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. +/// \returns A 64-bit vector of [2 x i32] containing the converted values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvttpd_pi32(__m128d __a) +{ + return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); +} + +/// Converts the two signed 32-bit integer elements of a 64-bit vector of +/// [2 x i32] into two double-precision floating-point values, returned in a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. +/// \returns A 128-bit vector of [2 x double] containing the converted values. +static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX +_mm_cvtpi32_pd(__m64 __a) +{ + return __builtin_ia32_cvtpi2pd((__v2si)__a); +} + +/// Returns the low-order element of a 128-bit vector of [2 x double] as +/// a double-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are returned. +/// \returns A double-precision floating-point value copied from the lower 64 +/// bits of \a __a. +static __inline__ double __DEFAULT_FN_ATTRS +_mm_cvtsd_f64(__m128d __a) +{ + return __a[0]; +} + +/// Loads a 128-bit floating-point vector of [2 x double] from an aligned +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPD / MOVAPD instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 16-byte aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_load_pd(double const *__dp) +{ + return *(const __m128d*)__dp; +} + +/// Loads a double-precision floating-point value from a specified memory +/// location and duplicates it to both vector elements of a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP / MOVDDUP instruction. +/// +/// \param __dp +/// A pointer to a memory location containing a double-precision value. +/// \returns A 128-bit vector of [2 x double] containing the loaded and +/// duplicated values. 
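/* Editor's note (illustrative sketch, not part of the upstream header): the
 * "tt" conversions truncate toward zero regardless of MXCSR, while the plain
 * forms use the current rounding mode:
 *
 *   __m128d v = _mm_set_sd(-1.7);        // lanes: { -1.7, 0.0 }
 *   int r     = _mm_cvtsd_si32(v);       // -2 under the default round-to-nearest mode
 *   int t     = _mm_cvttsd_si32(v);      // -1, always truncated
 *   double lo = _mm_cvtsd_f64(v);        // -1.7, no instruction needed
 *
 * The _mm_cvtpd_pi32/_mm_cvttpd_pi32/_mm_cvtpi32_pd forms above perform the
 * same two-element conversions through the 64-bit MMX register file. */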
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_load1_pd(double const *__dp) +{ + struct __mm_load1_pd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u; + return __extension__ (__m128d){ __u, __u }; +} + +#define _mm_load_pd1(dp) _mm_load1_pd(dp) + +/// Loads two double-precision values, in reverse order, from an aligned +/// memory location into a 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPD / MOVAPD instruction + +/// needed shuffling instructions. In AVX mode, the shuffling may be combined +/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. +/// +/// \param __dp +/// A 16-byte aligned pointer to an array of double-precision values to be +/// loaded in reverse order. +/// \returns A 128-bit vector of [2 x double] containing the reversed loaded +/// values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_loadr_pd(double const *__dp) +{ + __m128d __u = *(const __m128d*)__dp; + return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); +} + +/// Loads a 128-bit floating-point vector of [2 x double] from an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPD / MOVUPD instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_loadu_pd(double const *__dp) +{ + struct __loadu_pd { + __m128d_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_pd*)__dp)->__v; +} + +/// Loads a 64-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __a +/// A pointer to a 64-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [2 x i64] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si64(void const *__a) +{ + struct __loadu_si64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + long long __u = ((const struct __loadu_si64*)__a)->__v; + return __extension__ (__m128i)(__v2di){__u, 0LL}; +} + +/// Loads a 32-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __a +/// A pointer to a 32-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [4 x i32] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si32(void const *__a) +{ + struct __loadu_si32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + int __u = ((const struct __loadu_si32*)__a)->__v; + return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; +} + +/// Loads a 16-bit integer value to the low element of a 128-bit integer +/// vector and clears the upper element. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __a +/// A pointer to a 16-bit memory location. The address of the memory +/// location does not have to be aligned. 
+/// \returns A 128-bit vector of [8 x i16] containing the loaded value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si16(void const *__a) +{ + struct __loadu_si16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + short __u = ((const struct __loadu_si16*)__a)->__v; + return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; +} + +/// Loads a 64-bit double-precision value to the low element of a +/// 128-bit integer vector and clears the upper element. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSD / MOVSD instruction. +/// +/// \param __dp +/// A pointer to a memory location containing a double-precision value. +/// The address of the memory location does not have to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the loaded value. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_load_sd(double const *__dp) +{ + struct __mm_load_sd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + double __u = ((const struct __mm_load_sd_struct*)__dp)->__u; + return __extension__ (__m128d){ __u, 0 }; +} + +/// Loads a double-precision value into the high-order bits of a 128-bit +/// vector of [2 x double]. The low-order bits are copied from the low-order +/// bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the result. +/// \param __dp +/// A pointer to a 64-bit memory location containing a double-precision +/// floating-point value that is loaded. The loaded value is written to bits +/// [127:64] of the result. The address of the memory location does not have +/// to be aligned. +/// \returns A 128-bit vector of [2 x double] containing the moved values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_loadh_pd(__m128d __a, double const *__dp) +{ + struct __mm_loadh_pd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u; + return __extension__ (__m128d){ __a[0], __u }; +} + +/// Loads a double-precision value into the low-order bits of a 128-bit +/// vector of [2 x double]. The high-order bits are copied from the +/// high-order bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the result. +/// \param __dp +/// A pointer to a 64-bit memory location containing a double-precision +/// floating-point value that is loaded. The loaded value is written to bits +/// [63:0] of the result. The address of the memory location does not have to +/// be aligned. +/// \returns A 128-bit vector of [2 x double] containing the moved values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_loadl_pd(__m128d __a, double const *__dp) +{ + struct __mm_loadl_pd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u; + return __extension__ (__m128d){ __u, __a[1] }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double] with +/// unspecified content. This could be used as an argument to another +/// intrinsic function where the argument is required but the value is not +/// actually used. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. 
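/* Editor's note (illustrative sketch, not part of the upstream header): the
 * load family differs mainly in alignment requirements and in which lanes
 * are written. Example, assuming buf is 16-byte aligned where required:
 *
 *   double buf[2] = { 1.0, 2.0 };
 *   __m128d a = _mm_load_pd(buf);          // { 1.0, 2.0 }, buf must be 16-byte aligned
 *   __m128d u = _mm_loadu_pd(buf);         // same values, no alignment requirement
 *   __m128d s = _mm_load_sd(buf);          // { 1.0, 0.0 }
 *   __m128d h = _mm_loadh_pd(s, buf + 1);  // { 1.0, 2.0 }: high lane filled from memory
 *   __m128d l = _mm_loadl_pd(h, buf + 1);  // { 2.0, 2.0 }: low lane replaced
 */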
+/// +/// \returns A 128-bit floating-point vector of [2 x double] with unspecified +/// content. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_undefined_pd(void) +{ + return (__m128d)__builtin_ia32_undef128(); +} + +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// 64 bits of the vector are initialized with the specified double-precision +/// floating-point value. The upper 64 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. The +/// lower 64 bits contain the value of the parameter. The upper 64 bits are +/// set to zero. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_set_sd(double __w) +{ + return __extension__ (__m128d){ __w, 0 }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double], with each +/// of the two double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP / MOVLHPS instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_set1_pd(double __w) +{ + return __extension__ (__m128d){ __w, __w }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double], with each +/// of the two double-precision floating-point vector elements set to the +/// specified double-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP / MOVLHPS instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_set_pd1(double __w) +{ + return _mm_set1_pd(__w); +} + +/// Constructs a 128-bit floating-point vector of [2 x double] +/// initialized with the specified double-precision floating-point values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the upper 64 +/// bits of the result. +/// \param __x +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_set_pd(double __w, double __x) +{ + return __extension__ (__m128d){ __x, __w }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double], +/// initialized in reverse order with the specified double-precision +/// floating-point values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. +/// +/// \param __w +/// A double-precision floating-point value used to initialize the lower 64 +/// bits of the result. +/// \param __x +/// A double-precision floating-point value used to initialize the upper 64 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
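/* Editor's note (illustrative sketch, not part of the upstream header): the
 * _mm_set* constructors take their arguments from the high element down,
 * while the "r" (reversed) form below takes them in memory order:
 *
 *   __m128d a = _mm_set_pd(9.0, 3.0);    // lanes: { 3.0, 9.0 } (low, high)
 *   __m128d b = _mm_setr_pd(9.0, 3.0);   // lanes: { 9.0, 3.0 }
 *   __m128d c = _mm_set_sd(5.0);         // lanes: { 5.0, 0.0 }
 *   __m128d d = _mm_set1_pd(5.0);        // lanes: { 5.0, 5.0 }
 */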
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_setr_pd(double __w, double __x) +{ + return __extension__ (__m128d){ __w, __x }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double] +/// initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS / XORPS instruction. +/// +/// \returns An initialized 128-bit floating-point vector of [2 x double] with +/// all elements set to zero. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_setzero_pd(void) +{ + return __extension__ (__m128d){ 0, 0 }; +} + +/// Constructs a 128-bit floating-point vector of [2 x double]. The lower +/// 64 bits are set to the lower 64 bits of the second parameter. The upper +/// 64 bits are set to the upper 64 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDPD / BLENDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the +/// upper 64 bits of the result. +/// \param __b +/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the +/// lower 64 bits of the result. +/// \returns A 128-bit vector of [2 x double] containing the moved values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_move_sd(__m128d __a, __m128d __b) +{ + __a[0] = __b[0]; + return __a; +} + +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSD / MOVSD instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_sd(double *__dp, __m128d __a) +{ + struct __mm_store_sd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; +} + +/// Moves packed double-precision values from a 128-bit vector of +/// [2 x double] to a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPD / MOVAPS instruction. +/// +/// \param __dp +/// A pointer to an aligned memory location that can store two +/// double-precision values. +/// \param __a +/// A packed 128-bit vector of [2 x double] containing the values to be +/// moved. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_pd(double *__dp, __m128d __a) +{ + *(__m128d*)__dp = __a; +} + +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the +/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. +/// +/// \param __dp +/// A pointer to a memory location that can store two double-precision +/// values. +/// \param __a +/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each +/// of the values in \a __dp. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store1_pd(double *__dp, __m128d __a) +{ + __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); + _mm_store_pd(__dp, __a); +} + +/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to +/// the upper and lower 64 bits of a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the +/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. +/// +/// \param __dp +/// A pointer to a memory location that can store two double-precision +/// values. 
+/// \param __a +/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each +/// of the values in \a __dp. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_pd1(double *__dp, __m128d __a) +{ + _mm_store1_pd(__dp, __a); +} + +/// Stores a 128-bit vector of [2 x double] into an unaligned memory +/// location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPD / MOVUPD instruction. +/// +/// \param __dp +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_pd(double *__dp, __m128d __a) +{ + struct __storeu_pd { + __m128d_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_pd*)__dp)->__v = __a; +} + +/// Stores two double-precision values, in reverse order, from a 128-bit +/// vector of [2 x double] to a 16-byte aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to a shuffling instruction followed by a +/// VMOVAPD / MOVAPD instruction. +/// +/// \param __dp +/// A pointer to a 16-byte aligned memory location that can store two +/// double-precision values. +/// \param __a +/// A 128-bit vector of [2 x double] containing the values to be reversed and +/// stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storer_pd(double *__dp, __m128d __a) +{ + __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); + *(__m128d *)__dp = __a; +} + +/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeh_pd(double *__dp, __m128d __a) +{ + struct __mm_storeh_pd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; +} + +/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. +/// +/// \param __dp +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [2 x double] containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storel_pd(double *__dp, __m128d __a) +{ + struct __mm_storeh_pd_struct { + double __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; +} + +/// Adds the corresponding elements of two 128-bit vectors of [16 x i8], +/// saving the lower 8 bits of each sum in the corresponding element of a +/// 128-bit result vector of [16 x i8]. +/// +/// The integer elements of both parameters can be either signed or unsigned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDB / PADDB instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit vector of [16 x i8] containing the sums of both +/// parameters. 
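/* Editor's note (illustrative sketch, not part of the upstream header): the
 * store family mirrors the loads. Example, assuming out is 16-byte aligned
 * where required:
 *
 *   double out[2];
 *   __m128d v = _mm_setr_pd(1.0, 2.0);   // lanes: { 1.0, 2.0 }
 *   _mm_store_pd(out, v);                // out = { 1.0, 2.0 }, aligned store
 *   _mm_storeu_pd(out, v);               // same, but no alignment requirement
 *   _mm_storer_pd(out, v);               // out = { 2.0, 1.0 }, reversed
 *   _mm_storeh_pd(out, v);               // out[0] = 2.0 (high lane only)
 *   _mm_storel_pd(out, v);               // out[0] = 1.0 (low lane only)
 */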
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_add_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)((__v16qu)__a + (__v16qu)__b); +} + +/// Adds the corresponding elements of two 128-bit vectors of [8 x i16], +/// saving the lower 16 bits of each sum in the corresponding element of a +/// 128-bit result vector of [8 x i16]. +/// +/// The integer elements of both parameters can be either signed or unsigned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDW / PADDW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// \returns A 128-bit vector of [8 x i16] containing the sums of both +/// parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_add_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)((__v8hu)__a + (__v8hu)__b); +} + +/// Adds the corresponding elements of two 128-bit vectors of [4 x i32], +/// saving the lower 32 bits of each sum in the corresponding element of a +/// 128-bit result vector of [4 x i32]. +/// +/// The integer elements of both parameters can be either signed or unsigned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDD / PADDD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \param __b +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the sums of both +/// parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_add_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a + (__v4su)__b); +} + +/// Adds two signed or unsigned 64-bit integer values, returning the +/// lower 64 bits of the sum. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDQ instruction. +/// +/// \param __a +/// A 64-bit integer. +/// \param __b +/// A 64-bit integer. +/// \returns A 64-bit integer containing the sum of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_add_si64(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); +} + +/// Adds the corresponding elements of two 128-bit vectors of [2 x i64], +/// saving the lower 64 bits of each sum in the corresponding element of a +/// 128-bit result vector of [2 x i64]. +/// +/// The integer elements of both parameters can be either signed or unsigned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDQ / PADDQ instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. +/// \param __b +/// A 128-bit vector of [2 x i64]. +/// \returns A 128-bit vector of [2 x i64] containing the sums of both +/// parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_add_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a + (__v2du)__b); +} + +/// Adds, with saturation, the corresponding elements of two 128-bit +/// signed [16 x i8] vectors, saving each sum in the corresponding element of +/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are +/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDSB / PADDSB instruction. +/// +/// \param __a +/// A 128-bit signed [16 x i8] vector. +/// \param __b +/// A 128-bit signed [16 x i8] vector. +/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of +/// both parameters. 
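/* Editor's note (illustrative sketch, not part of the upstream header): the
 * plain element-wise adds wrap around on overflow:
 *
 *   __m128i a = _mm_set1_epi8(127);   // every byte = 0x7F
 *   __m128i b = _mm_set1_epi8(1);
 *   __m128i r = _mm_add_epi8(a, b);   // every byte = 0x80 (-128 as signed, 128 as unsigned)
 *
 * The same wrap-around behaviour applies to _mm_add_epi16, _mm_add_epi32 and
 * _mm_add_epi64; the saturating forms below clamp instead. */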
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_adds_epi8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); +#else + return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); +#endif +} + +/// Adds, with saturation, the corresponding elements of two 128-bit +/// signed [8 x i16] vectors, saving each sum in the corresponding element of +/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF +/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDSW / PADDSW instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of +/// both parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_adds_epi16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); +#else + return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Adds, with saturation, the corresponding elements of two 128-bit +/// unsigned [16 x i8] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF +/// are saturated to 0xFF. Negative sums are saturated to 0x00. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDUSB / PADDUSB instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums +/// of both parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_adds_epu8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); +#else + return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); +#endif +} + +/// Adds, with saturation, the corresponding elements of two 128-bit +/// unsigned [8 x i16] vectors, saving each sum in the corresponding element +/// of a 128-bit result vector of [8 x i16]. Positive sums greater than +/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPADDUSB / PADDUSB instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums +/// of both parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_adds_epu16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); +#else + return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Computes the rounded averages of corresponding elements of two +/// 128-bit unsigned [16 x i8] vectors, saving each result in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPAVGB / PAVGB instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded +/// averages of both parameters. 
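/* Editor's note (illustrative sketch, not part of the upstream header): the
 * saturating adds clamp to the element type's range instead of wrapping:
 *
 *   __m128i a = _mm_set1_epi8(127);
 *   __m128i b = _mm_set1_epi8(1);
 *   __m128i s = _mm_adds_epi8(a, b);       // every byte stays at 127 (0x7F)
 *
 *   __m128i c = _mm_set1_epi8((char)250);  // bit pattern 0xFA, read as 250 by the epu form
 *   __m128i u = _mm_adds_epu8(c, c);       // every byte saturates to 255 (0xFF)
 *
 * The __clang_major__ checks above merely select the newer
 * __builtin_elementwise_add_sat builtin when the compiler provides it. */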
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_avg_epu8(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); +} + +/// Computes the rounded averages of corresponding elements of two +/// 128-bit unsigned [8 x i16] vectors, saving each result in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPAVGW / PAVGW instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded +/// averages of both parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_avg_epu16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); +} + +/// Multiplies the corresponding elements of two 128-bit signed [8 x i16] +/// vectors, producing eight intermediate 32-bit signed integer products, and +/// adds the consecutive pairs of 32-bit products to form a 128-bit signed +/// [4 x i32] vector. +/// +/// For example, bits [15:0] of both parameters are multiplied producing a +/// 32-bit product, bits [31:16] of both parameters are multiplied producing +/// a 32-bit product, and the sum of those two products becomes bits [31:0] +/// of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMADDWD / PMADDWD instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [4 x i32] vector containing the sums of products +/// of both parameters. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_madd_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); +} + +/// Compares corresponding elements of two 128-bit signed [8 x i16] +/// vectors, saving the greater value from each comparison in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXSW / PMAXSW instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the greater value of +/// each comparison. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epi16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); +#else + return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// vectors, saving the greater value from each comparison in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXUB / PMAXUB instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of +/// each comparison. 
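+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2):
+ * _mm_madd_epi16 is the usual building block for 16-bit dot products.
+ *
+ *   __m128i x   = _mm_set1_epi16(3);
+ *   __m128i y   = _mm_set1_epi16(7);
+ *   __m128i acc = _mm_madd_epi16(x, y);   // each i32 lane: 3*7 + 3*7 = 42
+ */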
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epu8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); +#else + return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); +#endif +} + +/// Compares corresponding elements of two 128-bit signed [8 x i16] +/// vectors, saving the smaller value from each comparison in the +/// corresponding element of a 128-bit result vector of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINSW / PMINSW instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of +/// each comparison. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epi16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); +#else + return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Compares corresponding elements of two 128-bit unsigned [16 x i8] +/// vectors, saving the smaller value from each comparison in the +/// corresponding element of a 128-bit result vector of [16 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINUB / PMINUB instruction. +/// +/// \param __a +/// A 128-bit unsigned [16 x i8] vector. +/// \param __b +/// A 128-bit unsigned [16 x i8] vector. +/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of +/// each comparison. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epu8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); +#else + return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); +#endif +} + +/// Multiplies the corresponding elements of two signed [8 x i16] +/// vectors, saving the upper 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit signed [8 x i16] result vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULHW / PMULHW instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of +/// each of the eight 32-bit products. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mulhi_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); +} + +/// Multiplies the corresponding elements of two unsigned [8 x i16] +/// vectors, saving the upper 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit unsigned [8 x i16] result vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULHUW / PMULHUW instruction. +/// +/// \param __a +/// A 128-bit unsigned [8 x i16] vector. +/// \param __b +/// A 128-bit unsigned [8 x i16] vector. +/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits +/// of each of the eight 32-bit products. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mulhi_epu16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); +} + +/// Multiplies the corresponding elements of two signed [8 x i16] +/// vectors, saving the lower 16 bits of each 32-bit product in the +/// corresponding element of a 128-bit signed [8 x i16] result vector. 
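+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): the min/max
+ * pair gives a branch-free clamp across eight signed 16-bit lanes.
+ *
+ *   __m128i v  = _mm_set1_epi16(1000);
+ *   __m128i lo = _mm_set1_epi16(-255), hi = _mm_set1_epi16(255);
+ *   __m128i c  = _mm_min_epi16(_mm_max_epi16(v, lo), hi);   // every lane: 255
+ */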
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULLW / PMULLW instruction. +/// +/// \param __a +/// A 128-bit signed [8 x i16] vector. +/// \param __b +/// A 128-bit signed [8 x i16] vector. +/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of +/// each of the eight 32-bit products. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mullo_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)((__v8hu)__a * (__v8hu)__b); +} + +/// Multiplies 32-bit unsigned integer values contained in the lower bits +/// of the two 64-bit integer vectors and returns the 64-bit unsigned +/// product. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMULUDQ instruction. +/// +/// \param __a +/// A 64-bit integer containing one of the source operands. +/// \param __b +/// A 64-bit integer containing one of the source operands. +/// \returns A 64-bit integer vector containing the product of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_mul_su32(__m64 __a, __m64 __b) +{ + return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); +} + +/// Multiplies 32-bit unsigned integer values contained in the lower +/// bits of the corresponding elements of two [2 x i64] vectors, and returns +/// the 64-bit products in the corresponding elements of a [2 x i64] vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULUDQ / PMULUDQ instruction. +/// +/// \param __a +/// A [2 x i64] vector containing one of the source operands. +/// \param __b +/// A [2 x i64] vector containing one of the source operands. +/// \returns A [2 x i64] vector containing the product of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mul_epu32(__m128i __a, __m128i __b) +{ + return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); +} + +/// Computes the absolute differences of corresponding 8-bit integer +/// values in two 128-bit vectors. Sums the first 8 absolute differences, and +/// separately sums the second 8 absolute differences. Packs these two +/// unsigned 16-bit integer sums into the upper and lower elements of a +/// [2 x i64] vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSADBW / PSADBW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source operands. +/// \param __b +/// A 128-bit integer vector containing one of the source operands. +/// \returns A [2 x i64] vector containing the sums of the sets of absolute +/// differences between both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sad_epu8(__m128i __a, __m128i __b) +{ + return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); +} + +/// Subtracts the corresponding 8-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBB / PSUBB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sub_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)((__v16qu)__a - (__v16qu)__b); +} + +/// Subtracts the corresponding 16-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBW / PSUBW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. 
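+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2):
+ * _mm_mul_epu32 reads only the even (low) 32-bit lane of each 64-bit half and
+ * returns full 64-bit products, so no upper bits are lost.
+ *
+ *   __m128i a = _mm_set_epi32(0, 100000, 0, 70000);
+ *   __m128i p = _mm_mul_epu32(a, a);   // lanes: 70000*70000, 100000*100000 as u64
+ */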
+/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sub_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)((__v8hu)__a - (__v8hu)__b); +} + +/// Subtracts the corresponding 32-bit integer values in the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBD / PSUBD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sub_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4su)__a - (__v4su)__b); +} + +/// Subtracts signed or unsigned 64-bit integer values and writes the +/// difference to the corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBQ instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the minuend. +/// \param __b +/// A 64-bit integer vector containing the subtrahend. +/// \returns A 64-bit integer vector containing the difference of the values in +/// the operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_sub_si64(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); +} + +/// Subtracts the corresponding elements of two [2 x i64] vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBQ / PSUBQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sub_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a - (__v2du)__b); +} + +/// Subtracts corresponding 8-bit signed integer values in the input and +/// returns the differences in the corresponding bytes in the destination. +/// Differences greater than 0x7F are saturated to 0x7F, and differences less +/// than 0x80 are saturated to 0x80. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBSB / PSUBSB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_subs_epi8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); +#else + return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); +#endif +} + +/// Subtracts corresponding 16-bit signed integer values in the input and +/// returns the differences in the corresponding bytes in the destination. +/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less +/// than 0x8000 are saturated to 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBSW / PSUBSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. 
+/// \returns A 128-bit integer vector containing the differences of the values +/// in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_subs_epi16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); +#else + return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Subtracts corresponding 8-bit unsigned integer values in the input +/// and returns the differences in the corresponding bytes in the +/// destination. Differences less than 0x00 are saturated to 0x00. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBUSB / PSUBUSB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the unsigned integer +/// differences of the values in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_subs_epu8(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); +#else + return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); +#endif +} + +/// Subtracts corresponding 16-bit unsigned integer values in the input +/// and returns the differences in the corresponding bytes in the +/// destination. Differences less than 0x0000 are saturated to 0x0000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSUBUSW / PSUBUSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the minuends. +/// \param __b +/// A 128-bit integer vector containing the subtrahends. +/// \returns A 128-bit integer vector containing the unsigned integer +/// differences of the values in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_subs_epu16(__m128i __a, __m128i __b) +{ +#if (__clang_major__ > 14) + return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); +#else + return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); +#endif +} + +/// Performs a bitwise AND of two 128-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPAND / PAND instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source operands. +/// \param __b +/// A 128-bit integer vector containing one of the source operands. +/// \returns A 128-bit integer vector containing the bitwise AND of the values +/// in both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_and_si128(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a & (__v2du)__b); +} + +/// Performs a bitwise AND of two 128-bit integer vectors, using the +/// one's complement of the values contained in the first source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPANDN / PANDN instruction. +/// +/// \param __a +/// A 128-bit vector containing the left source operand. The one's complement +/// of this value is used in the bitwise AND. +/// \param __b +/// A 128-bit vector containing the right source operand. +/// \returns A 128-bit integer vector containing the bitwise AND of the one's +/// complement of the first operand and the values in the second operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_andnot_si128(__m128i __a, __m128i __b) +{ + return (__m128i)(~(__v2du)__a & (__v2du)__b); +} +/// Performs a bitwise OR of two 128-bit integer vectors. 
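+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2; the helper
+ * name is an example, not part of the API): AND / ANDNOT / OR compose into the
+ * classic branch-free select, picking from `a` where `mask` lanes are all-ones.
+ *
+ *   static __inline__ __m128i example_select(__m128i mask, __m128i a, __m128i b)
+ *   {
+ *       return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
+ *   }
+ */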
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPOR / POR instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source operands. +/// \param __b +/// A 128-bit integer vector containing one of the source operands. +/// \returns A 128-bit integer vector containing the bitwise OR of the values +/// in both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_or_si128(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a | (__v2du)__b); +} + +/// Performs a bitwise exclusive OR of two 128-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPXOR / PXOR instruction. +/// +/// \param __a +/// A 128-bit integer vector containing one of the source operands. +/// \param __b +/// A 128-bit integer vector containing one of the source operands. +/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the +/// values in both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_xor_si128(__m128i __a, __m128i __b) +{ + return (__m128i)((__v2du)__a ^ (__v2du)__b); +} + +/// Left-shifts the 128-bit integer vector operand by the specified +/// number of bytes. Low-order bits are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_slli_si128(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VPSLLDQ / PSLLDQ instruction. +/// +/// \param a +/// A 128-bit integer vector containing the source operand. +/// \param imm +/// An immediate value specifying the number of bytes to left-shift operand +/// \a a. +/// \returns A 128-bit integer vector containing the left-shifted value. +#define _mm_slli_si128(a, imm) \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) + +#define _mm_bslli_si128(a, imm) \ + ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) + +/// Left-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLW / PSLLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_slli_epi16(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); +} + +/// Left-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLW / PSLLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to left-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sll_epi16(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); +} + +/// Left-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLD / PSLLD instruction. 
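+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): note that
+ * _mm_slli_si128 shifts the whole 128-bit value by bytes, while _mm_slli_epi16
+ * shifts each 16-bit lane by bits.
+ *
+ *   __m128i v  = _mm_set1_epi16(1);
+ *   __m128i by = _mm_slli_si128(v, 2);   // shifted left by 2 bytes
+ *   __m128i bi = _mm_slli_epi16(v, 3);   // each lane: 1 << 3 = 8
+ */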
+/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_slli_epi32(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); +} + +/// Left-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLD / PSLLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to left-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sll_epi32(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); +} + +/// Left-shifts each 64-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLQ / PSLLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to left-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_slli_epi64(__m128i __a, int __count) +{ + return __builtin_ia32_psllqi128((__v2di)__a, __count); +} + +/// Left-shifts each 64-bit value in the 128-bit integer vector operand +/// by the specified number of bits. Low-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSLLQ / PSLLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to left-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the left-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sll_epi64(__m128i __a, __m128i __count) +{ + return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); +} + +/// Right-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRAW / PSRAW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to right-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srai_epi16(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); +} + +/// Right-shifts each 16-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRAW / PSRAW instruction. 
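+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): the
+ * arithmetic right shifts replicate the sign bit, so negative lanes stay
+ * negative.
+ *
+ *   __m128i v = _mm_set1_epi16(-16);
+ *   __m128i r = _mm_srai_epi16(v, 2);   // every lane: -16 >> 2 = -4
+ */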
+/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to right-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sra_epi16(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); +} + +/// Right-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRAD / PSRAD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to right-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srai_epi32(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); +} + +/// Right-shifts each 32-bit value in the 128-bit integer vector operand +/// by the specified number of bits. High-order bits are filled with the sign +/// bit of the initial value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRAD / PSRAD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to right-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sra_epi32(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); +} + +/// Right-shifts the 128-bit integer vector operand by the specified +/// number of bytes. High-order bits are cleared. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_srli_si128(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VPSRLDQ / PSRLDQ instruction. +/// +/// \param a +/// A 128-bit integer vector containing the source operand. +/// \param imm +/// An immediate value specifying the number of bytes to right-shift operand +/// \a a. +/// \returns A 128-bit integer vector containing the right-shifted value. +#define _mm_srli_si128(a, imm) \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) + +#define _mm_bsrli_si128(a, imm) \ + ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) + +/// Right-shifts each of 16-bit values in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLW / PSRLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to right-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srli_epi16(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); +} + +/// Right-shifts each of 16-bit values in the 128-bit integer vector +/// operand by the specified number of bits. 
High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLW / PSRLW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to right-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srl_epi16(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); +} + +/// Right-shifts each of 32-bit values in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLD / PSRLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to right-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srli_epi32(__m128i __a, int __count) +{ + return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); +} + +/// Right-shifts each of 32-bit values in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLD / PSRLD instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to right-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srl_epi32(__m128i __a, __m128i __count) +{ + return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); +} + +/// Right-shifts each of 64-bit values in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLQ / PSRLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// An integer value specifying the number of bits to right-shift each value +/// in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srli_epi64(__m128i __a, int __count) +{ + return __builtin_ia32_psrlqi128((__v2di)__a, __count); +} + +/// Right-shifts each of 64-bit values in the 128-bit integer vector +/// operand by the specified number of bits. High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPSRLQ / PSRLQ instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the source operand. +/// \param __count +/// A 128-bit integer vector in which bits [63:0] specify the number of bits +/// to right-shift each value in operand \a __a. +/// \returns A 128-bit integer vector containing the right-shifted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_srl_epi64(__m128i __a, __m128i __count) +{ + return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); +} + +/// Compares each of the corresponding 8-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF +/// for true. 
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPEQB / PCMPEQB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpeq_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)((__v16qi)__a == (__v16qi)__b); +} + +/// Compares each of the corresponding 16-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, +/// 0xFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPEQW / PCMPEQW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpeq_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)((__v8hi)__a == (__v8hi)__b); +} + +/// Compares each of the corresponding 32-bit values of the 128-bit +/// integer vectors for equality. Each comparison yields 0x0 for false, +/// 0xFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPEQD / PCMPEQD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpeq_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4si)__a == (__v4si)__b); +} + +/// Compares each of the corresponding signed 8-bit values of the 128-bit +/// integer vectors to determine if the values in the first operand are +/// greater than those in the second operand. Each comparison yields 0x0 for +/// false, 0xFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTB / PCMPGTB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpgt_epi8(__m128i __a, __m128i __b) +{ + /* This function always performs a signed comparison, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m128i)((__v16qs)__a > (__v16qs)__b); +} + +/// Compares each of the corresponding signed 16-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are greater than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTW / PCMPGTW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpgt_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)((__v8hi)__a > (__v8hi)__b); +} + +/// Compares each of the corresponding signed 32-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are greater than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTD / PCMPGTD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. 
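+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): the
+ * comparisons return all-ones or all-zeros lanes, which combine with the
+ * bitwise select idiom above, e.g. a per-lane signed maximum:
+ *
+ *   __m128i a   = _mm_set_epi32(1, -5, 7, 3), b = _mm_set_epi32(2, -9, 7, 4);
+ *   __m128i gt  = _mm_cmpgt_epi32(a, b);
+ *   __m128i max = _mm_or_si128(_mm_and_si128(gt, a), _mm_andnot_si128(gt, b));
+ */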
+/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpgt_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)((__v4si)__a > (__v4si)__b); +} + +/// Compares each of the corresponding signed 8-bit values of the 128-bit +/// integer vectors to determine if the values in the first operand are less +/// than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTB / PCMPGTB instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmplt_epi8(__m128i __a, __m128i __b) +{ + return _mm_cmpgt_epi8(__b, __a); +} + +/// Compares each of the corresponding signed 16-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are less than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTW / PCMPGTW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmplt_epi16(__m128i __a, __m128i __b) +{ + return _mm_cmpgt_epi16(__b, __a); +} + +/// Compares each of the corresponding signed 32-bit values of the +/// 128-bit integer vectors to determine if the values in the first operand +/// are less than those in the second operand. +/// +/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTD / PCMPGTD instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __b +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmplt_epi32(__m128i __a, __m128i __b) +{ + return _mm_cmpgt_epi32(__b, __a); +} + +#ifdef __x86_64__ +/// Converts a 64-bit signed integer value from the second operand into a +/// double-precision value and returns it in the lower element of a [2 x +/// double] vector; the upper element of the returned vector is copied from +/// the upper element of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSI2SD / CVTSI2SD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are +/// copied to the upper 64 bits of the destination. +/// \param __b +/// A 64-bit signed integer operand containing the value to be converted. +/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the +/// converted value of the second operand. The upper 64 bits are copied from +/// the upper 64 bits of the first operand. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_cvtsi64_sd(__m128d __a, long long __b) +{ + __a[0] = __b; + return __a; +} + +/// Converts the first (lower) element of a vector of [2 x double] into a +/// 64-bit signed integer value, according to the current rounding mode. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSD2SI / CVTSD2SI instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the +/// conversion. 
+/// \returns A 64-bit signed integer containing the converted value. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvtsd_si64(__m128d __a) +{ + return __builtin_ia32_cvtsd2si64((__v2df)__a); +} + +/// Converts the first (lower) element of a vector of [2 x double] into a +/// 64-bit signed integer value, truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTSD2SI / CVTTSD2SI +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the +/// conversion. +/// \returns A 64-bit signed integer containing the converted value. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvttsd_si64(__m128d __a) +{ + return __builtin_ia32_cvttsd2si64((__v2df)__a); +} +#endif + +/// Converts a vector of [4 x i32] into a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTDQ2PS / CVTDQ2PS instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit vector of [4 x float] containing the converted values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvtepi32_ps(__m128i __a) +{ + return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); +} + +/// Converts a vector of [4 x float] into a vector of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPS2DQ / CVTPS2DQ instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit integer vector of [4 x i32] containing the converted +/// values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtps_epi32(__m128 __a) +{ + return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); +} + +/// Converts a vector of [4 x float] into a vector of [4 x i32], +/// truncating the result when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTPS2DQ / CVTTPS2DQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x i32] containing the converted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvttps_epi32(__m128 __a) +{ + return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); +} + +/// Returns a vector of [4 x i32] where the lowest element is the input +/// operand and the remaining elements are zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __a +/// A 32-bit signed integer operand. +/// \returns A 128-bit vector of [4 x i32]. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtsi32_si128(int __a) +{ + return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; +} + +#ifdef __x86_64__ +/// Returns a vector of [2 x i64] where the lower element is the input +/// operand and the upper element is zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __a +/// A 64-bit signed integer operand containing the value to be converted. +/// \returns A 128-bit vector of [2 x i64] containing the converted value. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtsi64_si128(long long __a) +{ + return __extension__ (__m128i)(__v2di){ __a, 0 }; +} +#endif + +/// Moves the least significant 32 bits of a vector of [4 x i32] to a +/// 32-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __a +/// A vector of [4 x i32]. The least significant 32 bits are moved to the +/// destination. 
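+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): the cvt
+ * conversions honor the current rounding mode (round-to-nearest by default),
+ * while the cvtt forms always truncate toward zero.
+ *
+ *   __m128  f = _mm_set1_ps(2.7f);
+ *   __m128i r = _mm_cvtps_epi32(f);    // lanes: 3 (rounded)
+ *   __m128i t = _mm_cvttps_epi32(f);   // lanes: 2 (truncated)
+ */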
+/// \returns A 32-bit signed integer containing the moved value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvtsi128_si32(__m128i __a) +{ + __v4si __b = (__v4si)__a; + return __b[0]; +} + +#ifdef __x86_64__ +/// Moves the least significant 64 bits of a vector of [2 x i64] to a +/// 64-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __a +/// A vector of [2 x i64]. The least significant 64 bits are moved to the +/// destination. +/// \returns A 64-bit signed integer containing the moved value. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvtsi128_si64(__m128i __a) +{ + return __a[0]; +} +#endif + +/// Moves packed integer values from an aligned 128-bit memory location +/// to elements in a 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQA / MOVDQA instruction. +/// +/// \param __p +/// An aligned pointer to a memory location containing integer values. +/// \returns A 128-bit integer vector containing the moved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_load_si128(__m128i const *__p) +{ + return *__p; +} + +/// Moves packed integer values from an unaligned 128-bit memory location +/// to elements in a 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDQU / MOVDQU instruction. +/// +/// \param __p +/// A pointer to a memory location containing integer values. +/// \returns A 128-bit integer vector containing the moved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadu_si128(__m128i_u const *__p) +{ + struct __loadu_si128 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_si128*)__p)->__v; +} + +/// Returns a vector of [2 x i64] where the lower element is taken from +/// the lower element of the operand, and the upper element is zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __p +/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of +/// the destination. +/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the +/// moved value. The higher order bits are cleared. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_loadl_epi64(__m128i_u const *__p) +{ + struct __mm_loadl_epi64_struct { + long long __u; + } __attribute__((__packed__, __may_alias__)); + return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0}; +} + +/// Generates a 128-bit vector of [4 x i32] with unspecified content. +/// This could be used as an argument to another intrinsic function where the +/// argument is required but the value is not actually used. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 128-bit vector of [4 x i32] with unspecified content. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_undefined_si128(void) +{ + return (__m128i)__builtin_ia32_undef128(); +} + +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// the specified 64-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __q1 +/// A 64-bit integer value used to initialize the upper 64 bits of the +/// destination vector of [2 x i64]. +/// \param __q0 +/// A 64-bit integer value used to initialize the lower 64 bits of the +/// destination vector of [2 x i64]. 
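+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2; the buffers
+ * are example data): _mm_load_si128 requires 16-byte alignment, while
+ * _mm_loadu_si128 accepts any address.
+ *
+ *   _Alignas(16) int aligned_buf[4] = { 1, 2, 3, 4 };
+ *   int unaligned_buf[5]            = { 0, 1, 2, 3, 4 };
+ *   __m128i x = _mm_load_si128((const __m128i *)aligned_buf);
+ *   __m128i y = _mm_loadu_si128((const __m128i_u *)(unaligned_buf + 1));
+ */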
+/// \returns An initialized 128-bit vector of [2 x i64] containing the values +/// provided in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set_epi64x(long long __q1, long long __q0) +{ + return __extension__ (__m128i)(__v2di){ __q0, __q1 }; +} + +/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with +/// the specified 64-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __q1 +/// A 64-bit integer value used to initialize the upper 64 bits of the +/// destination vector of [2 x i64]. +/// \param __q0 +/// A 64-bit integer value used to initialize the lower 64 bits of the +/// destination vector of [2 x i64]. +/// \returns An initialized 128-bit vector of [2 x i64] containing the values +/// provided in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set_epi64(__m64 __q1, __m64 __q0) +{ + return _mm_set_epi64x((long long)__q1, (long long)__q0); +} + +/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with +/// the specified 32-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i3 +/// A 32-bit integer value used to initialize bits [127:96] of the +/// destination vector. +/// \param __i2 +/// A 32-bit integer value used to initialize bits [95:64] of the destination +/// vector. +/// \param __i1 +/// A 32-bit integer value used to initialize bits [63:32] of the destination +/// vector. +/// \param __i0 +/// A 32-bit integer value used to initialize bits [31:0] of the destination +/// vector. +/// \returns An initialized 128-bit vector of [4 x i32] containing the values +/// provided in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) +{ + return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; +} + +/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with +/// the specified 16-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w7 +/// A 16-bit integer value used to initialize bits [127:112] of the +/// destination vector. +/// \param __w6 +/// A 16-bit integer value used to initialize bits [111:96] of the +/// destination vector. +/// \param __w5 +/// A 16-bit integer value used to initialize bits [95:80] of the destination +/// vector. +/// \param __w4 +/// A 16-bit integer value used to initialize bits [79:64] of the destination +/// vector. +/// \param __w3 +/// A 16-bit integer value used to initialize bits [63:48] of the destination +/// vector. +/// \param __w2 +/// A 16-bit integer value used to initialize bits [47:32] of the destination +/// vector. +/// \param __w1 +/// A 16-bit integer value used to initialize bits [31:16] of the destination +/// vector. +/// \param __w0 +/// A 16-bit integer value used to initialize bits [15:0] of the destination +/// vector. +/// \returns An initialized 128-bit vector of [8 x i16] containing the values +/// provided in the operands. 
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) +{ + return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; +} + +/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with +/// the specified 8-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b15 +/// Initializes bits [127:120] of the destination vector. +/// \param __b14 +/// Initializes bits [119:112] of the destination vector. +/// \param __b13 +/// Initializes bits [111:104] of the destination vector. +/// \param __b12 +/// Initializes bits [103:96] of the destination vector. +/// \param __b11 +/// Initializes bits [95:88] of the destination vector. +/// \param __b10 +/// Initializes bits [87:80] of the destination vector. +/// \param __b9 +/// Initializes bits [79:72] of the destination vector. +/// \param __b8 +/// Initializes bits [71:64] of the destination vector. +/// \param __b7 +/// Initializes bits [63:56] of the destination vector. +/// \param __b6 +/// Initializes bits [55:48] of the destination vector. +/// \param __b5 +/// Initializes bits [47:40] of the destination vector. +/// \param __b4 +/// Initializes bits [39:32] of the destination vector. +/// \param __b3 +/// Initializes bits [31:24] of the destination vector. +/// \param __b2 +/// Initializes bits [23:16] of the destination vector. +/// \param __b1 +/// Initializes bits [15:8] of the destination vector. +/// \param __b0 +/// Initializes bits [7:0] of the destination vector. +/// \returns An initialized 128-bit vector of [16 x i8] containing the values +/// provided in the operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) +{ + return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; +} + +/// Initializes both values in a 128-bit integer vector with the +/// specified 64-bit integer value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __q +/// Integer value used to initialize the elements of the destination integer +/// vector. +/// \returns An initialized 128-bit integer vector of [2 x i64] with both +/// elements containing the value provided in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set1_epi64x(long long __q) +{ + return _mm_set_epi64x(__q, __q); +} + +/// Initializes both values in a 128-bit vector of [2 x i64] with the +/// specified 64-bit value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __q +/// A 64-bit value used to initialize the elements of the destination integer +/// vector. +/// \returns An initialized 128-bit vector of [2 x i64] with all elements +/// containing the value provided in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set1_epi64(__m64 __q) +{ + return _mm_set_epi64(__q, __q); +} + +/// Initializes all values in a 128-bit vector of [4 x i32] with the +/// specified 32-bit value. 
+/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i +/// A 32-bit value used to initialize the elements of the destination integer +/// vector. +/// \returns An initialized 128-bit vector of [4 x i32] with all elements +/// containing the value provided in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set1_epi32(int __i) +{ + return _mm_set_epi32(__i, __i, __i, __i); +} + +/// Initializes all values in a 128-bit vector of [8 x i16] with the +/// specified 16-bit value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w +/// A 16-bit value used to initialize the elements of the destination integer +/// vector. +/// \returns An initialized 128-bit vector of [8 x i16] with all elements +/// containing the value provided in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set1_epi16(short __w) +{ + return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); +} + +/// Initializes all values in a 128-bit vector of [16 x i8] with the +/// specified 8-bit value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b +/// An 8-bit value used to initialize the elements of the destination integer +/// vector. +/// \returns An initialized 128-bit vector of [16 x i8] with all elements +/// containing the value provided in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_set1_epi8(char __b) +{ + return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); +} + +/// Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 64-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __q0 +/// A 64-bit integral value used to initialize the lower 64 bits of the +/// result. +/// \param __q1 +/// A 64-bit integral value used to initialize the upper 64 bits of the +/// result. +/// \returns An initialized 128-bit integer vector. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_setr_epi64(__m64 __q0, __m64 __q1) +{ + return _mm_set_epi64(__q1, __q0); +} + +/// Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 32-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integral value used to initialize bits [31:0] of the result. +/// \param __i1 +/// A 32-bit integral value used to initialize bits [63:32] of the result. +/// \param __i2 +/// A 32-bit integral value used to initialize bits [95:64] of the result. +/// \param __i3 +/// A 32-bit integral value used to initialize bits [127:96] of the result. +/// \returns An initialized 128-bit integer vector. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) +{ + return _mm_set_epi32(__i3, __i2, __i1, __i0); +} + +/// Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 16-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w0 +/// A 16-bit integral value used to initialize bits [15:0] of the result. 
+/// \param __w1 +/// A 16-bit integral value used to initialize bits [31:16] of the result. +/// \param __w2 +/// A 16-bit integral value used to initialize bits [47:32] of the result. +/// \param __w3 +/// A 16-bit integral value used to initialize bits [63:48] of the result. +/// \param __w4 +/// A 16-bit integral value used to initialize bits [79:64] of the result. +/// \param __w5 +/// A 16-bit integral value used to initialize bits [95:80] of the result. +/// \param __w6 +/// A 16-bit integral value used to initialize bits [111:96] of the result. +/// \param __w7 +/// A 16-bit integral value used to initialize bits [127:112] of the result. +/// \returns An initialized 128-bit integer vector. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) +{ + return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); +} + +/// Constructs a 128-bit integer vector, initialized in reverse order +/// with the specified 8-bit integral values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b0 +/// An 8-bit integral value used to initialize bits [7:0] of the result. +/// \param __b1 +/// An 8-bit integral value used to initialize bits [15:8] of the result. +/// \param __b2 +/// An 8-bit integral value used to initialize bits [23:16] of the result. +/// \param __b3 +/// An 8-bit integral value used to initialize bits [31:24] of the result. +/// \param __b4 +/// An 8-bit integral value used to initialize bits [39:32] of the result. +/// \param __b5 +/// An 8-bit integral value used to initialize bits [47:40] of the result. +/// \param __b6 +/// An 8-bit integral value used to initialize bits [55:48] of the result. +/// \param __b7 +/// An 8-bit integral value used to initialize bits [63:56] of the result. +/// \param __b8 +/// An 8-bit integral value used to initialize bits [71:64] of the result. +/// \param __b9 +/// An 8-bit integral value used to initialize bits [79:72] of the result. +/// \param __b10 +/// An 8-bit integral value used to initialize bits [87:80] of the result. +/// \param __b11 +/// An 8-bit integral value used to initialize bits [95:88] of the result. +/// \param __b12 +/// An 8-bit integral value used to initialize bits [103:96] of the result. +/// \param __b13 +/// An 8-bit integral value used to initialize bits [111:104] of the result. +/// \param __b14 +/// An 8-bit integral value used to initialize bits [119:112] of the result. +/// \param __b15 +/// An 8-bit integral value used to initialize bits [127:120] of the result. +/// \returns An initialized 128-bit integer vector. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) +{ + return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +/// Creates a 128-bit integer vector initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS / XORPS instruction. +/// +/// \returns An initialized 128-bit integer vector with all elements set to +/// zero. 
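+/* Usage sketch (illustrative only; assumes <emmintrin.h> and SSE2): _mm_set_*
+ * takes arguments from the most significant element down, while _mm_setr_*
+ * takes them in memory order, so these two calls build the same vector.
+ *
+ *   __m128i a = _mm_set_epi32(3, 2, 1, 0);
+ *   __m128i b = _mm_setr_epi32(0, 1, 2, 3);
+ */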
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_setzero_si128(void) +{ + return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; +} + +/// Stores a 128-bit integer vector to a memory location aligned on a +/// 128-bit boundary. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. +/// +/// \param __p +/// A pointer to an aligned memory location that will receive the integer +/// values. +/// \param __b +/// A 128-bit integer vector containing the values to be moved. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_si128(__m128i *__p, __m128i __b) +{ + *__p = __b; +} + +/// Stores a 128-bit integer vector to an unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the integer values. +/// \param __b +/// A 128-bit integer vector containing the values to be moved. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si128(__m128i_u *__p, __m128i __b) +{ + struct __storeu_si128 { + __m128i_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si128*)__p)->__v = __b; +} + +/// Stores a 64-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __p +/// A pointer to a 64-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si64(void *__p, __m128i __b) +{ + struct __storeu_si64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; +} + +/// Stores a 32-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVD / MOVD instruction. +/// +/// \param __p +/// A pointer to a 32-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si32(void *__p, __m128i __b) +{ + struct __storeu_si32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; +} + +/// Stores a 16-bit integer value from the low element of a 128-bit integer +/// vector. +/// +/// \headerfile +/// +/// This intrinsic does not correspond to a specific instruction. +/// +/// \param __p +/// A pointer to a 16-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __b +/// A 128-bit integer vector containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_si16(void *__p, __m128i __b) +{ + struct __storeu_si16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; +} + +/// Moves bytes selected by the mask from the first operand to the +/// specified unaligned memory location. When a mask bit is 1, the +/// corresponding byte is written, otherwise it is not written. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). Exception and trap behavior for elements not selected +/// for storage to memory are implementation dependent. 
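/*
 * Editor's note (illustrative only; not part of the upstream header): the
 * aligned and unaligned store variants documented above, sketched under the
 * assumption that `buf16` points to 16-byte-aligned storage while `buf` need
 * not be aligned; both names are placeholders.
 *
 *   __m128i v = _mm_set1_epi8(0x5A);
 *   _mm_store_si128((__m128i *)buf16, v);    // requires 16-byte alignment
 *   _mm_storeu_si128((__m128i_u *)buf, v);   // no alignment requirement
 *   _mm_storeu_si32(buf, v);                 // writes only the low 32 bits
 */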
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMASKMOVDQU / MASKMOVDQU +/// instruction. +/// +/// \param __d +/// A 128-bit integer vector containing the values to be moved. +/// \param __n +/// A 128-bit integer vector containing the mask. The most significant bit of +/// each byte represents the mask bits. +/// \param __p +/// A pointer to an unaligned 128-bit memory location where the specified +/// values are moved. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) +{ + __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); +} + +/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to +/// a memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVLPS / MOVLPS instruction. +/// +/// \param __p +/// A pointer to a 64-bit memory location that will receive the lower 64 bits +/// of the integer vector parameter. +/// \param __a +/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the +/// value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storel_epi64(__m128i_u *__p, __m128i __a) +{ + struct __mm_storel_epi64_struct { + long long __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; +} + +/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit +/// aligned memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTPS / MOVNTPS instruction. +/// +/// \param __p +/// A pointer to the 128-bit aligned memory location used to store the value. +/// \param __a +/// A vector of [2 x double] containing the 64-bit values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_stream_pd(double *__p, __m128d __a) +{ + __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); +} + +/// Stores a 128-bit integer vector to a 128-bit aligned memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTPS / MOVNTPS instruction. +/// +/// \param __p +/// A pointer to the 128-bit aligned memory location used to store the value. +/// \param __a +/// A 128-bit integer vector containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_stream_si128(__m128i *__p, __m128i __a) +{ + __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); +} + +/// Stores a 32-bit integer value in the specified memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVNTI instruction. +/// +/// \param __p +/// A pointer to the 32-bit memory location used to store the value. +/// \param __a +/// A 32-bit integer containing the value to be stored. +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) +_mm_stream_si32(int *__p, int __a) +{ + __builtin_ia32_movnti(__p, __a); +} + +#ifdef __x86_64__ +/// Stores a 64-bit integer value in the specified memory location. +/// +/// To minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVNTIQ instruction. +/// +/// \param __p +/// A pointer to the 64-bit memory location used to store the value. 
+/// \param __a +/// A 64-bit integer containing the value to be stored. +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) +_mm_stream_si64(long long *__p, long long __a) +{ + __builtin_ia32_movnti64(__p, __a); +} +#endif + +#if defined(__cplusplus) +extern "C" { +#endif + +/// The cache line containing \a __p is flushed and invalidated from all +/// caches in the coherency domain. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLFLUSH instruction. +/// +/// \param __p +/// A pointer to the memory location used to identify the cache line to be +/// flushed. +void _mm_clflush(void const * __p); + +/// Forces strong memory ordering (serialization) between load +/// instructions preceding this instruction and load instructions following +/// this instruction, ensuring the system completes all previous loads before +/// executing subsequent loads. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LFENCE instruction. +/// +void _mm_lfence(void); + +/// Forces strong memory ordering (serialization) between load and store +/// instructions preceding this instruction and load and store instructions +/// following this instruction, ensuring that the system completes all +/// previous memory accesses before executing subsequent memory accesses. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MFENCE instruction. +/// +void _mm_mfence(void); + +#if defined(__cplusplus) +} // extern "C" +#endif + +/// Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit signed integers, and packs the results into the +/// destination. Positive values greater than 0x7F are saturated to 0x7F. +/// Negative values less than 0x80 are saturated to 0x80. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPACKSSWB / PACKSSWB instruction. +/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to a 8-bit signed integer with +/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less +/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to a 8-bit signed integer with +/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less +/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are +/// written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_packs_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); +} + +/// Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit signed integers, and packs the results into the +/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. +/// Negative values less than 0x8000 are saturated to 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPACKSSDW / PACKSSDW instruction. +/// +/// \param __a +/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as +/// a signed integer and is converted to a 16-bit signed integer with +/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values +/// less than 0x8000 are saturated to 0x8000. 
The converted [4 x i16] values +/// are written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as +/// a signed integer and is converted to a 16-bit signed integer with +/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values +/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values +/// are written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_packs_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); +} + +/// Converts 16-bit signed integers from both 128-bit integer vector +/// operands into 8-bit unsigned integers, and packs the results into the +/// destination. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0x00 are saturated to 0x00. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPACKUSWB / PACKUSWB instruction. +/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as +/// a signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are +/// written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [16 x i8] containing the converted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_packus_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); +} + +/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using +/// the immediate-value parameter as a selector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPEXTRW / PEXTRW instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \param __imm +/// An immediate value. Bits [2:0] selects values from \a __a to be assigned +/// to bits[15:0] of the result. \n +/// 000: assign values from bits [15:0] of \a __a. \n +/// 001: assign values from bits [31:16] of \a __a. \n +/// 010: assign values from bits [47:32] of \a __a. \n +/// 011: assign values from bits [63:48] of \a __a. \n +/// 100: assign values from bits [79:64] of \a __a. \n +/// 101: assign values from bits [95:80] of \a __a. \n +/// 110: assign values from bits [111:96] of \a __a. \n +/// 111: assign values from bits [127:112] of \a __a. +/// \returns An integer, whose lower 16 bits are selected from the 128-bit +/// integer vector parameter and the remaining bits are assigned zeros. +#define _mm_extract_epi16(a, imm) \ + ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ + (int)(imm))) + +/// Constructs a 128-bit integer vector by first making a copy of the +/// 128-bit integer vector parameter, and then inserting the lower 16 bits +/// of an integer parameter into an offset specified by the immediate-value +/// parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPINSRW / PINSRW instruction. 
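/*
 * Editor's note (illustrative only; not part of the upstream header): a
 * sketch of the saturating pack and the word-extract macro documented above;
 * the input value is arbitrary and chosen to trigger saturation.
 *
 *   __m128i w = _mm_set1_epi32(100000);      // exceeds the signed 16-bit range
 *   __m128i p = _mm_packs_epi32(w, w);       // every 16-bit lane saturates to 0x7FFF
 *   int lane3 = _mm_extract_epi16(p, 3);     // 0x7FFF, zero-extended to int
 */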
+/// +/// \param __a +/// A 128-bit integer vector of [8 x i16]. This vector is copied to the +/// result and then one of the eight elements in the result is replaced by +/// the lower 16 bits of \a __b. +/// \param __b +/// An integer. The lower 16 bits of this parameter are written to the +/// result beginning at an offset specified by \a __imm. +/// \param __imm +/// An immediate value specifying the bit offset in the result at which the +/// lower 16 bits of \a __b are written. +/// \returns A 128-bit integer vector containing the constructed values. +#define _mm_insert_epi16(a, b, imm) \ + ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ + (int)(imm))) + +/// Copies the values of the most significant bits from each 8-bit +/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask +/// value, zero-extends the value, and writes it to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVMSKB / PMOVMSKB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values with bits to be extracted. +/// \returns The most significant bits from each 8-bit element in \a __a, +/// written to bits [15:0]. The other bits are assigned zeros. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_movemask_epi8(__m128i __a) +{ + return __builtin_ia32_pmovmskb128((__v16qi)__a); +} + +/// Constructs a 128-bit integer vector by shuffling four 32-bit +/// elements of a 128-bit integer vector parameter, using the immediate-value +/// parameter as a specifier. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VPSHUFD / PSHUFD instruction. +/// +/// \param a +/// A 128-bit integer vector containing the values to be copied. +/// \param imm +/// An immediate value containing an 8-bit value specifying which elements to +/// copy from a. The destinations within the 128-bit destination are assigned +/// values as follows: \n +/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n +/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n +/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n +/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [31:0] of \a a. \n +/// 01: assign values from bits [63:32] of \a a. \n +/// 10: assign values from bits [95:64] of \a a. \n +/// 11: assign values from bits [127:96] of \a a. +/// \returns A 128-bit integer vector containing the shuffled values. +#define _mm_shuffle_epi32(a, imm) \ + ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) + +/// Constructs a 128-bit integer vector by shuffling four lower 16-bit +/// elements of a 128-bit integer vector of [8 x i16], using the immediate +/// value parameter as a specifier. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VPSHUFLW / PSHUFLW instruction. +/// +/// \param a +/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits +/// [127:64] of the result. +/// \param imm +/// An 8-bit immediate value specifying which elements to copy from \a a. \n +/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n +/// Bits[3:2] are used to assign values to bits [31:16] of the result. 
\n +/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n +/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [15:0] of \a a. \n +/// 01: assign values from bits [31:16] of \a a. \n +/// 10: assign values from bits [47:32] of \a a. \n +/// 11: assign values from bits [63:48] of \a a. \n +/// \returns A 128-bit integer vector containing the shuffled values. +#define _mm_shufflelo_epi16(a, imm) \ + ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) + +/// Constructs a 128-bit integer vector by shuffling four upper 16-bit +/// elements of a 128-bit integer vector of [8 x i16], using the immediate +/// value parameter as a specifier. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VPSHUFHW / PSHUFHW instruction. +/// +/// \param a +/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits +/// [63:0] of the result. +/// \param imm +/// An 8-bit immediate value specifying which elements to copy from \a a. \n +/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n +/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n +/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n +/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n +/// Bit value assignments: \n +/// 00: assign values from bits [79:64] of \a a. \n +/// 01: assign values from bits [95:80] of \a a. \n +/// 10: assign values from bits [111:96] of \a a. \n +/// 11: assign values from bits [127:112] of \a a. \n +/// \returns A 128-bit integer vector containing the shuffled values. +#define _mm_shufflehi_epi16(a, imm) \ + ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) + +/// Unpacks the high-order (index 8-15) values from two 128-bit vectors +/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKHBW / PUNPCKHBW +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// Bits [71:64] are written to bits [7:0] of the result. \n +/// Bits [79:72] are written to bits [23:16] of the result. \n +/// Bits [87:80] are written to bits [39:32] of the result. \n +/// Bits [95:88] are written to bits [55:48] of the result. \n +/// Bits [103:96] are written to bits [71:64] of the result. \n +/// Bits [111:104] are written to bits [87:80] of the result. \n +/// Bits [119:112] are written to bits [103:96] of the result. \n +/// Bits [127:120] are written to bits [119:112] of the result. +/// \param __b +/// A 128-bit vector of [16 x i8]. \n +/// Bits [71:64] are written to bits [15:8] of the result. \n +/// Bits [79:72] are written to bits [31:24] of the result. \n +/// Bits [87:80] are written to bits [47:40] of the result. \n +/// Bits [95:88] are written to bits [63:56] of the result. \n +/// Bits [103:96] are written to bits [79:72] of the result. \n +/// Bits [111:104] are written to bits [95:88] of the result. \n +/// Bits [119:112] are written to bits [111:104] of the result. \n +/// Bits [127:120] are written to bits [127:120] of the result. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
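/*
 * Editor's note (illustrative only; not part of the upstream header): the
 * dword shuffle documented above with a literal selector. 0x1B encodes the
 * selectors 3,2,1,0 (two bits per destination lane), so it reverses the
 * vector; the input values are arbitrary.
 *
 *   __m128i v = _mm_setr_epi32(10, 20, 30, 40);
 *   __m128i r = _mm_shuffle_epi32(v, 0x1B);   // r = {40, 30, 20, 10} in lane order
 */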
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpackhi_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); +} + +/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of +/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKHWD / PUNPCKHWD +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [15:0] of the result. \n +/// Bits [95:80] are written to bits [47:32] of the result. \n +/// Bits [111:96] are written to bits [79:64] of the result. \n +/// Bits [127:112] are written to bits [111:96] of the result. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [79:64] are written to bits [31:16] of the result. \n +/// Bits [95:80] are written to bits [63:48] of the result. \n +/// Bits [111:96] are written to bits [95:80] of the result. \n +/// Bits [127:112] are written to bits [127:112] of the result. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpackhi_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); +} + +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of +/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKHDQ / PUNPCKHDQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. \n +/// Bits [95:64] are written to bits [31:0] of the destination. \n +/// Bits [127:96] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. \n +/// Bits [95:64] are written to bits [64:32] of the destination. \n +/// Bits [127:96] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpackhi_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); +} + +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKHQDQ / PUNPCKHQDQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. \n +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x i64]. \n +/// Bits [127:64] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpackhi_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); +} + +/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of +/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLBW / PUNPCKLBW +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. \n +/// Bits [7:0] are written to bits [7:0] of the result. \n +/// Bits [15:8] are written to bits [23:16] of the result. 
\n +/// Bits [23:16] are written to bits [39:32] of the result. \n +/// Bits [31:24] are written to bits [55:48] of the result. \n +/// Bits [39:32] are written to bits [71:64] of the result. \n +/// Bits [47:40] are written to bits [87:80] of the result. \n +/// Bits [55:48] are written to bits [103:96] of the result. \n +/// Bits [63:56] are written to bits [119:112] of the result. +/// \param __b +/// A 128-bit vector of [16 x i8]. +/// Bits [7:0] are written to bits [15:8] of the result. \n +/// Bits [15:8] are written to bits [31:24] of the result. \n +/// Bits [23:16] are written to bits [47:40] of the result. \n +/// Bits [31:24] are written to bits [63:56] of the result. \n +/// Bits [39:32] are written to bits [79:72] of the result. \n +/// Bits [47:40] are written to bits [95:88] of the result. \n +/// Bits [55:48] are written to bits [111:104] of the result. \n +/// Bits [63:56] are written to bits [127:120] of the result. +/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpacklo_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); +} + +/// Unpacks the low-order (index 0-3) values from each of the two 128-bit +/// vectors of [8 x i16] and interleaves them into a 128-bit vector of +/// [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLWD / PUNPCKLWD +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [15:0] of the result. \n +/// Bits [31:16] are written to bits [47:32] of the result. \n +/// Bits [47:32] are written to bits [79:64] of the result. \n +/// Bits [63:48] are written to bits [111:96] of the result. +/// \param __b +/// A 128-bit vector of [8 x i16]. +/// Bits [15:0] are written to bits [31:16] of the result. \n +/// Bits [31:16] are written to bits [63:48] of the result. \n +/// Bits [47:32] are written to bits [95:80] of the result. \n +/// Bits [63:48] are written to bits [127:112] of the result. +/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpacklo_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); +} + +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLDQ / PUNPCKLDQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32]. \n +/// Bits [31:0] are written to bits [31:0] of the destination. \n +/// Bits [63:32] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x i32]. \n +/// Bits [31:0] are written to bits [64:32] of the destination. \n +/// Bits [63:32] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpacklo_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); +} + +/// Unpacks the low-order 64-bit elements from two 128-bit vectors of +/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
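/*
 * Editor's note (illustrative only; not part of the upstream header): a
 * common widening idiom built from the interleaves documented above, assuming
 * the bytes in the placeholder vector `bytes` are to be treated as unsigned.
 *
 *   __m128i zero = _mm_setzero_si128();
 *   __m128i lo16 = _mm_unpacklo_epi8(bytes, zero);  // low 8 bytes -> 8 x u16
 *   __m128i hi16 = _mm_unpackhi_epi8(bytes, zero);  // high 8 bytes -> 8 x u16
 */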
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPUNPCKLQDQ / PUNPCKLQDQ +/// instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x i64]. \n +/// Bits [63:0] are written to bits [63:0] of the destination. \n +/// \param __b +/// A 128-bit vector of [2 x i64]. \n +/// Bits [63:0] are written to bits [127:64] of the destination. \n +/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_unpacklo_epi64(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); +} + +/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit +/// integer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVDQ2Q instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are moved to the +/// destination. +/// \returns A 64-bit integer containing the lower 64 bits of the parameter. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_movepi64_pi64(__m128i __a) +{ + return (__m64)__a[0]; +} + +/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the +/// upper bits. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVD+VMOVQ instruction. +/// +/// \param __a +/// A 64-bit value. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. The upper 64 bits are assigned zeros. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_movpi64_epi64(__m64 __a) +{ + return __extension__ (__m128i)(__v2di){ (long long)__a, 0 }; +} + +/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit +/// integer vector, zeroing the upper bits. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. +/// +/// \param __a +/// A 128-bit integer vector operand. The lower 64 bits are moved to the +/// destination. +/// \returns A 128-bit integer vector. The lower 64 bits contain the value from +/// the operand. The upper 64 bits are assigned zeros. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_move_epi64(__m128i __a) +{ + return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); +} + +/// Unpacks the high-order 64-bit elements from two 128-bit vectors of +/// [2 x double] and interleaves them into a 128-bit vector of [2 x +/// double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKHPD / UNPCKHPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. \n +/// Bits [127:64] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_unpackhi_pd(__m128d __a, __m128d __b) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); +} + +/// Unpacks the low-order 64-bit elements from two 128-bit vectors +/// of [2 x double] and interleaves them into a 128-bit vector of [2 x +/// double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [63:0] of the destination. +/// \param __b +/// A 128-bit vector of [2 x double]. \n +/// Bits [63:0] are written to bits [127:64] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
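/*
 * Editor's note (illustrative only; not part of the upstream header): the
 * double-precision interleaves documented above, sketched for two arbitrary
 * vectors a = {a0, a1} and b = {b0, b1} (element 0 in the low 64 bits).
 *
 *   __m128d lo = _mm_unpacklo_pd(a, b);   // {a0, b0}
 *   __m128d hi = _mm_unpackhi_pd(a, b);   // {a1, b1}
 */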
+static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_unpacklo_pd(__m128d __a, __m128d __b) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); +} + +/// Extracts the sign bits of the double-precision values in the 128-bit +/// vector of [2 x double], zero-extends the value, and writes it to the +/// low-order bits of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVMSKPD / MOVMSKPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the values with sign bits to +/// be extracted. +/// \returns The sign bits from each of the double-precision elements in \a __a, +/// written to bits [1:0]. The remaining bits are assigned values of zero. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_movemask_pd(__m128d __a) +{ + return __builtin_ia32_movmskpd((__v2df)__a); +} + + +/// Constructs a 128-bit floating-point vector of [2 x double] from two +/// 128-bit vector parameters of [2 x double], using the immediate-value +/// parameter as a specifier. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); +/// \endcode +/// +/// This intrinsic corresponds to the VSHUFPD / SHUFPD instruction. +/// +/// \param a +/// A 128-bit vector of [2 x double]. +/// \param b +/// A 128-bit vector of [2 x double]. +/// \param i +/// An 8-bit immediate value. The least significant two bits specify which +/// elements to copy from \a a and \a b: \n +/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n +/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n +/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n +/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n +/// \returns A 128-bit vector of [2 x double] containing the shuffled values. +#define _mm_shuffle_pd(a, b, i) \ + ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ + (int)(i))) + +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// floating-point vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [2 x double]. +/// \returns A 128-bit floating-point vector of [4 x float] containing the same +/// bitwise pattern as the parameter. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_castpd_ps(__m128d __a) +{ + return (__m128)__a; +} + +/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit +/// integer vector. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [2 x double]. +/// \returns A 128-bit integer vector containing the same bitwise pattern as the +/// parameter. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_castpd_si128(__m128d __a) +{ + return (__m128i)__a; +} + +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// floating-point vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 128-bit floating-point vector of [2 x double] containing the same +/// bitwise pattern as the parameter. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_castps_pd(__m128 __a) +{ + return (__m128d)__a; +} + +/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit +/// integer vector. 
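/*
 * Editor's note (illustrative only; not part of the upstream header): a
 * sketch of the sign-mask extraction and bitwise casts documented above, for
 * an arbitrary vector x of [2 x double].
 *
 *   int neg = _mm_movemask_pd(x);         // bit 0: sign of x[0], bit 1: sign of x[1]
 *   if (neg == 0x3) { }                   // both elements negative
 *   __m128i bits = _mm_castpd_si128(x);   // reinterpret the same 128 bits, no conversion
 */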
+/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 128-bit integer vector containing the same bitwise pattern as the +/// parameter. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_castps_si128(__m128 __a) +{ + return (__m128i)__a; +} + +/// Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit floating-point vector of [4 x float] containing the same +/// bitwise pattern as the parameter. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_castsi128_ps(__m128i __a) +{ + return (__m128)__a; +} + +/// Casts a 128-bit integer vector into a 128-bit floating-point vector +/// of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit integer vector. +/// \returns A 128-bit floating-point vector of [2 x double] containing the same +/// bitwise pattern as the parameter. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_castsi128_pd(__m128i __a) +{ + return (__m128d)__a; +} + +#if defined(__cplusplus) +extern "C" { +#endif + +/// Indicates that a spin loop is being executed for the purposes of +/// optimizing power consumption during the loop. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PAUSE instruction. +/// +void _mm_pause(void); + +#if defined(__cplusplus) +} // extern "C" +#endif +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX + +#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) + +#define _MM_DENORMALS_ZERO_ON (0x0040U) +#define _MM_DENORMALS_ZERO_OFF (0x0000U) + +#define _MM_DENORMALS_ZERO_MASK (0x0040U) + +#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) +#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) + +#endif /* __EMMINTRIN_H */ diff --git a/include-llvm/enqcmdintrin.h b/include-llvm/enqcmdintrin.h new file mode 100644 index 0000000..30af67f --- /dev/null +++ b/include-llvm/enqcmdintrin.h @@ -0,0 +1,63 @@ +/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __ENQCMDINTRIN_H +#define __ENQCMDINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define _DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("enqcmd"))) + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst. +/// This intrinsics may only be used in User mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMD instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. 
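/*
 * Editor's note (illustrative only; not part of the upstream header): a
 * hypothetical use of the enqueue store documented above. `portal` would be a
 * device work-submission address and `desc` a 64-byte command descriptor;
 * both names are assumptions, and ENQCMD-capable hardware is required.
 *
 *   if (_enqcmd(portal, desc) != 0) {
 *       // command was not accepted; the caller may retry or fall back
 *   }
 */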
+static __inline__ int _DEFAULT_FN_ATTRS +_enqcmd (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmd(__dst, __src); +} + +/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store +/// data, and performs 64-byte enqueue store to memory pointed by \a __dst +/// This intrinsic may only be used in Privileged mode. +/// +/// \headerfile +/// +/// This intrinsics corresponds to the ENQCMDS instruction. +/// +/// \param __dst +/// Pointer to the destination of the enqueue store. +/// \param __src +/// Pointer to 64-byte command data. +/// \returns If the command data is successfully written to \a __dst then 0 is +/// returned. Otherwise 1 is returned. +static __inline__ int _DEFAULT_FN_ATTRS +_enqcmds (void *__dst, const void *__src) +{ + return __builtin_ia32_enqcmds(__dst, __src); +} + +#undef _DEFAULT_FN_ATTRS + +#endif /* __ENQCMDINTRIN_H */ diff --git a/include-llvm/f16cintrin.h b/include-llvm/f16cintrin.h new file mode 100644 index 0000000..13905e6 --- /dev/null +++ b/include-llvm/f16cintrin.h @@ -0,0 +1,162 @@ +/*===---- f16cintrin.h - F16C intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __F16CINTRIN_H +#define __F16CINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 \ + __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256))) + +/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h, + * but that's because icc can emulate these without f16c using a library call. + * Since we don't do that let's leave these in f16cintrin.h. + */ + +/// Converts a 16-bit half-precision float value into a 32-bit float +/// value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPH2PS instruction. +/// +/// \param __a +/// A 16-bit half-precision float value. +/// \returns The converted 32-bit float value. +static __inline float __DEFAULT_FN_ATTRS128 +_cvtsh_ss(unsigned short __a) +{ + __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; + __v4sf __r = __builtin_ia32_vcvtph2ps(__v); + return __r[0]; +} + +/// Converts a 32-bit single-precision float value to a 16-bit +/// half-precision float value. +/// +/// \headerfile +/// +/// \code +/// unsigned short _cvtss_sh(float a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VCVTPS2PH instruction. +/// +/// \param a +/// A 32-bit single-precision float value to be converted to a 16-bit +/// half-precision float value. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n +/// 1XX: Use MXCSR.RC for rounding +/// \returns The converted 16-bit half-precision float value. +#define _cvtss_sh(a, imm) \ + ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ + (imm)))[0])) + +/// Converts a 128-bit vector containing 32-bit float values into a +/// 128-bit vector containing 16-bit half-precision float values. 
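/*
 * Editor's note (illustrative only; not part of the upstream header): a
 * scalar round trip through the half-precision conversions documented above.
 * Per the rounding table above, an immediate of 0 selects round-to-nearest.
 *
 *   unsigned short h = _cvtss_sh(1.5f, 0);   // float -> 16-bit half
 *   float          f = _cvtsh_ss(h);         // back to float; 1.5 is exact in half
 */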
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cvtps_ph(__m128 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VCVTPS2PH instruction. +/// +/// \param a +/// A 128-bit vector containing 32-bit float values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing converted 16-bit half-precision float +/// values. The lower 64 bits are used to store the converted 16-bit +/// half-precision floating-point values. +#define _mm_cvtps_ph(a, imm) \ + ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))) + +/// Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 128-bit vector containing 32-bit float values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPH2PS instruction. +/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float values. The lower +/// 64 bits are used in the conversion. +/// \returns A 128-bit vector of [4 x float] containing converted float values. +static __inline __m128 __DEFAULT_FN_ATTRS128 +_mm_cvtph_ps(__m128i __a) +{ + return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a); +} + +/// Converts a 256-bit vector of [8 x float] into a 128-bit vector +/// containing 16-bit half-precision float values. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); +/// \endcode +/// +/// This intrinsic corresponds to the VCVTPS2PH instruction. +/// +/// \param a +/// A 256-bit vector containing 32-bit single-precision float values to be +/// converted to 16-bit half-precision float values. +/// \param imm +/// An immediate value controlling rounding using bits [2:0]: \n +/// 000: Nearest \n +/// 001: Down \n +/// 010: Up \n +/// 011: Truncate \n +/// 1XX: Use MXCSR.RC for rounding +/// \returns A 128-bit vector containing the converted 16-bit half-precision +/// float values. +#define _mm256_cvtps_ph(a, imm) \ + ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))) + +/// Converts a 128-bit vector containing 16-bit half-precision float +/// values into a 256-bit vector of [8 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTPH2PS instruction. +/// +/// \param __a +/// A 128-bit vector containing 16-bit half-precision float values to be +/// converted to 32-bit single-precision float values. +/// \returns A vector of [8 x float] containing the converted 32-bit +/// single-precision float values. +static __inline __m256 __DEFAULT_FN_ATTRS256 +_mm256_cvtph_ps(__m128i __a) +{ + return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __F16CINTRIN_H */ diff --git a/include-llvm/fma4intrin.h b/include-llvm/fma4intrin.h new file mode 100644 index 0000000..694801b --- /dev/null +++ b/include-llvm/fma4intrin.h @@ -0,0 +1,218 @@ +/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __FMA4INTRIN_H +#define __FMA4INTRIN_H + +#include + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return 
(__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __FMA4INTRIN_H */ diff --git a/include-llvm/fmaintrin.h b/include-llvm/fmaintrin.h new file mode 100644 index 0000000..d889b7c --- /dev/null +++ b/include-llvm/fmaintrin.h @@ -0,0 +1,216 @@ +/*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __FMAINTRIN_H +#define __FMAINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fnmsub_sd(__m128d __A, __m128d 
__B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS128 +_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) +{ + return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS128 +_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) +{ + return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) +{ + return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) +{ + return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); +} + +#undef __DEFAULT_FN_ATTRS128 +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __FMAINTRIN_H */ diff --git a/include-llvm/fxsrintrin.h b/include-llvm/fxsrintrin.h new file mode 100644 index 0000000..afee6aa --- /dev/null +++ b/include-llvm/fxsrintrin.h @@ -0,0 +1,91 @@ +/*===---- 
fxsrintrin.h - FXSR intrinsic ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __FXSRINTRIN_H +#define __FXSRINTRIN_H + +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr"))) + +/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// memory region pointed to by the input parameter \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the FXSAVE instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. +static __inline__ void __DEFAULT_FN_ATTRS +_fxsave(void *__p) +{ + __builtin_ia32_fxsave(__p); +} + +/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// memory region pointed to by the input parameter \a __p. The contents of +/// this memory region should have been written to by a previous \c _fxsave +/// or \c _fxsave64 intrinsic. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the FXRSTOR instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. +static __inline__ void __DEFAULT_FN_ATTRS +_fxrstor(void *__p) +{ + __builtin_ia32_fxrstor(__p); +} + +#ifdef __x86_64__ +/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte +/// memory region pointed to by the input parameter \a __p. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the FXSAVE64 instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. +static __inline__ void __DEFAULT_FN_ATTRS +_fxsave64(void *__p) +{ + __builtin_ia32_fxsave64(__p); +} + +/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte +/// memory region pointed to by the input parameter \a __p. The contents of +/// this memory region should have been written to by a previous \c _fxsave +/// or \c _fxsave64 intrinsic. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the FXRSTOR64 instruction. +/// +/// \param __p +/// A pointer to a 512-byte memory region. The beginning of this memory +/// region should be aligned on a 16-byte boundary. +static __inline__ void __DEFAULT_FN_ATTRS +_fxrstor64(void *__p) +{ + __builtin_ia32_fxrstor64(__p); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/gfniintrin.h b/include-llvm/gfniintrin.h new file mode 100644 index 0000000..a59238b --- /dev/null +++ b/include-llvm/gfniintrin.h @@ -0,0 +1,192 @@ +/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __GFNIINTRIN_H +#define __GFNIINTRIN_H + +/* Default attributes for simple form (no masking). 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) + +/* Default attributes for YMM unmasked form. */ +#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) + +/* Default attributes for ZMM forms. */ +#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) + +/* Default attributes for VLX forms. */ +#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) + +#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ + ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) + +#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ + ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), \ + (char)(I))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, + (__v16qi) __B); +} + +#ifdef __AVXINTRIN_H +#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ + ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) + +#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ + ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ + (__v32qi)(__m256i)(B), \ + (char)(I))) + +static __inline__ __m256i __DEFAULT_FN_ATTRS_Y +_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, + (__v32qi) __B); +} +#endif /* __AVXINTRIN_H */ + +#ifdef __AVX512BWINTRIN_H +#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ + ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) + +#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v64qi)(__m512i)(S))) + +#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) + +#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ + ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ + (__v64qi)(__m512i)(B), \ + (char)(I))) + +#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ + (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ + (__v64qi)(__m512i)(S))) + +#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ + U, A, B, I) + +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z +_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A, + (__v64qi) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z +_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_selectb_512(__U, + (__v64qi) _mm512_gf2p8mul_epi8(__A, __B), + (__v64qi) __S); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_Z +_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) +{ + return 
_mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(), + __U, __A, __B); +} +#endif /* __AVX512BWINTRIN_H */ + +#ifdef __AVX512VLBWINTRIN_H +#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) + +#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ + U, A, B, I) + +#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) + +#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ + _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ + (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ + (__v16qi)(__m128i)(S))) + +#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I) + +#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ + ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ + (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ + (__v32qi)(__m256i)(S))) + +#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ + _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ + U, A, B, I) + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 +_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) +{ + return (__m128i) __builtin_ia32_selectb_128(__U, + (__v16qi) _mm_gf2p8mul_epi8(__A, __B), + (__v16qi) __S); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 +_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) +{ + return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), + __U, __A, __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 +_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_selectb_256(__U, + (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), + (__v32qi) __S); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 +_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) +{ + return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), + __U, __A, __B); +} +#endif /* __AVX512VLBWINTRIN_H */ + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_Y +#undef __DEFAULT_FN_ATTRS_Z +#undef __DEFAULT_FN_ATTRS_VL128 +#undef __DEFAULT_FN_ATTRS_VL256 + +#endif /* __GFNIINTRIN_H */ + diff --git a/include-llvm/hresetintrin.h b/include-llvm/hresetintrin.h new file mode 100644 index 0000000..13e31a2 --- /dev/null +++ b/include-llvm/hresetintrin.h @@ -0,0 +1,49 @@ +/*===---------------- hresetintrin.h - HRESET intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __X86GPRINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __HRESETINTRIN_H +#define __HRESETINTRIN_H + +#if __has_extension(gnu_asm) + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("hreset"))) + +/// Provides a hint to the processor to selectively reset the prediction +/// history of the current logical processor specified by a 32-bit integer +/// value \a __eax. +/// +/// This intrinsic corresponds to the HRESET instruction. +/// +/// \operation +/// IF __eax == 0 +/// // nop +/// ELSE +/// FOR i := 0 to 31 +/// IF __eax[i] +/// ResetPredictionFeature(i) +/// FI +/// ENDFOR +/// FI +/// \endoperation +static __inline void __DEFAULT_FN_ATTRS +_hreset(int __eax) +{ + __asm__ ("hreset $0" :: "a"(__eax)); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __has_extension(gnu_asm) */ + +#endif /* __HRESETINTRIN_H */ diff --git a/include-llvm/ia32intrin.h b/include-llvm/ia32intrin.h new file mode 100644 index 0000000..ec8142b --- /dev/null +++ b/include-llvm/ia32intrin.h @@ -0,0 +1,441 @@ +/* ===-------- ia32intrin.h ---------------------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __IA32INTRIN_H +#define __IA32INTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32"))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +/** Find the first set bit starting from the lsb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__bsfd(int __A) { + return __builtin_ctz(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__bsrd(int __A) { + return 31 - __builtin_clz(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 32-bit integer operand. + * \returns A 32-bit integer containing the swapped bytes. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__bswapd(int __A) { + return __builtin_bswap32(__A); +} + +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_bswap(int __A) { + return __builtin_bswap32(__A); +} + +#define _bit_scan_forward(A) __bsfd((A)) +#define _bit_scan_reverse(A) __bsrd((A)) + +#ifdef __x86_64__ +/** Find the first set bit starting from the lsb. 
Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSF instruction or the + * TZCNT instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__bsfq(long long __A) { + return __builtin_ctzll(__A); +} + +/** Find the first set bit starting from the msb. Result is undefined if + * input is 0. + * + * \headerfile + * + * This intrinsic corresponds to the BSR instruction or the + * LZCNT instruction and an XOR . + * + * \param __A + * A 64-bit integer operand. + * \returns A 32-bit integer containing the bit number. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__bsrq(long long __A) { + return 63 - __builtin_clzll(__A); +} + +/** Swaps the bytes in the input. Converting little endian to big endian or + * vice versa. + * + * \headerfile + * + * This intrinsic corresponds to the BSWAP instruction. + * + * \param __A + * A 64-bit integer operand. + * \returns A 64-bit integer containing the swapped bytes. + */ +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR +__bswapq(long long __A) { + return __builtin_bswap64(__A); +} + +#define _bswap64(A) __bswapq((A)) +#endif + +/** Counts the number of bits in the source operand having a value of 1. + * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 32-bit integer operand. + * \returns A 32-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +__popcntd(unsigned int __A) +{ + return __builtin_popcount(__A); +} + +#define _popcnt32(A) __popcntd((A)) + +#ifdef __x86_64__ +/** Counts the number of bits in the source operand having a value of 1. + * + * \headerfile + * + * This intrinsic corresponds to the POPCNT instruction or a + * a sequence of arithmetic and logic ops to calculate it. + * + * \param __A + * An unsigned 64-bit integer operand. + * \returns A 64-bit integer containing the number of bits with value 1 in the + * source operand. + */ +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR +__popcntq(unsigned long long __A) +{ + return __builtin_popcountll(__A); +} + +#define _popcnt64(A) __popcntq((A)) +#endif /* __x86_64__ */ + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__readeflags(void) +{ + return __builtin_ia32_readeflags_u64(); +} + +static __inline__ void __DEFAULT_FN_ATTRS +__writeeflags(unsigned long long __f) +{ + __builtin_ia32_writeeflags_u64(__f); +} + +#else /* !__x86_64__ */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__readeflags(void) +{ + return __builtin_ia32_readeflags_u32(); +} + +static __inline__ void __DEFAULT_FN_ATTRS +__writeeflags(unsigned int __f) +{ + __builtin_ia32_writeeflags_u32(__f); +} +#endif /* !__x86_64__ */ + +/** Cast a 32-bit float value to a 32-bit unsigned integer value + * + * \headerfile + * This intrinsic corresponds to the VMOVD / MOVD instruction in x86_64, + * and corresponds to the VMOVL / MOVL instruction in ia32. + * + * \param __A + * A 32-bit float value. + * \returns a 32-bit unsigned integer containing the converted value. 
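+ *
+ *  For example (illustrative), _castf32_u32(1.0f) returns 0x3F800000, the raw
+ *  IEEE-754 encoding of 1.0f; the cast reinterprets the bits and performs no
+ *  numeric conversion.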
+ */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST +_castf32_u32(float __A) { + return __builtin_bit_cast(unsigned int, __A); +} + +/** Cast a 64-bit float value to a 64-bit unsigned integer value + * + * \headerfile + * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, + * and corresponds to the VMOVL / MOVL instruction in ia32. + * + * \param __A + * A 64-bit float value. + * \returns a 64-bit unsigned integer containing the converted value. + */ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST +_castf64_u64(double __A) { + return __builtin_bit_cast(unsigned long long, __A); +} + +/** Cast a 32-bit unsigned integer value to a 32-bit float value + * + * \headerfile + * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, + * and corresponds to the FLDS instruction in ia32. + * + * \param __A + * A 32-bit unsigned integer value. + * \returns a 32-bit float value containing the converted value. + */ +static __inline__ float __DEFAULT_FN_ATTRS_CAST +_castu32_f32(unsigned int __A) { + return __builtin_bit_cast(float, __A); +} + +/** Cast a 64-bit unsigned integer value to a 64-bit float value + * + * \headerfile + * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, + * and corresponds to the FLDL instruction in ia32. + * + * \param __A + * A 64-bit unsigned integer value. + * \returns a 64-bit float value containing the converted value. + */ +static __inline__ double __DEFAULT_FN_ATTRS_CAST +_castu64_f64(unsigned long long __A) { + return __builtin_bit_cast(double, __A); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned char operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32B instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 8-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 +__crc32b(unsigned int __C, unsigned char __D) +{ + return __builtin_ia32_crc32qi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned short operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32W instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 16-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 +__crc32w(unsigned int __C, unsigned short __D) +{ + return __builtin_ia32_crc32hi(__C, __D); +} + +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * second unsigned integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32D instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 32-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. 
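+ *
+ *  A typical usage sketch (illustrative; \c buf and \c len stand for the
+ *  caller's data and are not part of this header) folds a buffer into a
+ *  running CRC-32C value one byte at a time with the byte-wise __crc32b()
+ *  form defined above:
+ *
+ *  \code
+ *  unsigned int crc = 0xFFFFFFFFu;
+ *  for (size_t i = 0; i < len; ++i)
+ *    crc = __crc32b(crc, buf[i]);
+ *  crc = ~crc;  /* final inversion used by the common CRC-32C convention */
+ *  \endcode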
+ */ +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 +__crc32d(unsigned int __C, unsigned int __D) +{ + return __builtin_ia32_crc32si(__C, __D); +} + +#ifdef __x86_64__ +/** Adds the unsigned integer operand to the CRC-32C checksum of the + * unsigned 64-bit integer operand. + * + * \headerfile + * + * This intrinsic corresponds to the CRC32Q instruction. + * + * \param __C + * An unsigned integer operand to add to the CRC-32C checksum of operand + * \a __D. + * \param __D + * An unsigned 64-bit integer operand used to compute the CRC-32C checksum. + * \returns The result of adding operand \a __C to the CRC-32C checksum of + * operand \a __D. + */ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32 +__crc32q(unsigned long long __C, unsigned long long __D) +{ + return __builtin_ia32_crc32di(__C, __D); +} +#endif /* __x86_64__ */ + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__rdpmc(int __A) { + return __builtin_ia32_rdpmc(__A); +} + +/* __rdtscp */ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__rdtscp(unsigned int *__A) { + return __builtin_ia32_rdtscp(__A); +} + +#define _rdtsc() __rdtsc() + +#define _rdpmc(A) __rdpmc(A) + +static __inline__ void __DEFAULT_FN_ATTRS +_wbinvd(void) { + __builtin_ia32_wbinvd(); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +__rolb(unsigned char __X, int __C) { + return __builtin_rotateleft8(__X, __C); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR +__rorb(unsigned char __X, int __C) { + return __builtin_rotateright8(__X, __C); +} + +static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR +__rolw(unsigned short __X, int __C) { + return __builtin_rotateleft16(__X, __C); +} + +static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR +__rorw(unsigned short __X, int __C) { + return __builtin_rotateright16(__X, __C); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR +__rold(unsigned int __X, int __C) { + return __builtin_rotateleft32(__X, __C); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR +__rord(unsigned int __X, int __C) { + return __builtin_rotateright32(__X, __C); +} + +#ifdef __x86_64__ +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR +__rolq(unsigned long long __X, int __C) { + return __builtin_rotateleft64(__X, __C); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR +__rorq(unsigned long long __X, int __C) { + return __builtin_rotateright64(__X, __C); +} +#endif /* __x86_64__ */ + +#ifndef _MSC_VER +/* These are already provided as builtins for MSVC. */ +/* Select the correct function based on the size of long. */ +#ifdef __LP64__ +#define _lrotl(a,b) __rolq((a), (b)) +#define _lrotr(a,b) __rorq((a), (b)) +#else +#define _lrotl(a,b) __rold((a), (b)) +#define _lrotr(a,b) __rord((a), (b)) +#endif +#define _rotl(a,b) __rold((a), (b)) +#define _rotr(a,b) __rord((a), (b)) +#endif // _MSC_VER + +/* These are not builtins so need to be provided in all modes. 
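+ * For example (illustrative): _rotwl(0x1234, 4) == 0x2341 and
+ * _rotwr(0x1234, 4) == 0x4123.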
*/ +#define _rotwl(a,b) __rolw((a), (b)) +#define _rotwr(a,b) __rorw((a), (b)) + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CAST +#undef __DEFAULT_FN_ATTRS_CRC32 +#undef __DEFAULT_FN_ATTRS_CONSTEXPR + +#endif /* __IA32INTRIN_H */ diff --git a/include-llvm/immintrin.h b/include-llvm/immintrin.h new file mode 100644 index 0000000..e5174f8 --- /dev/null +++ b/include-llvm/immintrin.h @@ -0,0 +1,618 @@ +/*===---- immintrin.h - Intel intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#define __IMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__MMX__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SSE__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SSE2__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SSE3__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SSSE3__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__SSE4_2__) || defined(__SSE4_1__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AES__) || defined(__PCLMUL__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__CLFLUSHOPT__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__CLWB__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX2__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__F16C__) +#include +#endif + +/* No feature check desired due to internal checks */ +#include + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__BMI2__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__LZCNT__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__POPCNT__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__FMA__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512F__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VL__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512BW__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512BITALG__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + 
defined(__AVX512CD__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VPOPCNTDQ__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VNNI__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VNNI__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVXVNNI__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512DQ__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BITALG__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BW__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512CD__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512DQ__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512ER__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512IFMA__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512IFMA__) && defined(__AVX512VL__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VBMI__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VBMI__) && defined(__AVX512VL__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VBMI2__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VBMI2__) && defined(__AVX512VL__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512PF__) +#include +#endif + +/* + * FIXME: _Float16 type is legal only when HW support float16 operation. + * We use __AVX512FP16__ to identify if float16 is supported or not, so + * when float16 is not supported, the related header is not included. 
+ * + */ +#if defined(__AVX512FP16__) +#include +#endif + +#if defined(__AVX512FP16__) && defined(__AVX512VL__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512BF16__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512BF16__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__PKU__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__VPCLMULQDQ__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__VAES__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__GFNI__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__RDPID__) +/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the RDPID instruction. +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid"))) +_rdpid_u32(void) { + return __builtin_ia32_rdpid(); +} +#endif // __RDPID__ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__RDRND__) +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand16_step(unsigned short *__p) +{ + return __builtin_ia32_rdrand16_step(__p); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand32_step(unsigned int *__p) +{ + return __builtin_ia32_rdrand32_step(__p); +} + +#ifdef __x86_64__ +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) +_rdrand64_step(unsigned long long *__p) +{ + return __builtin_ia32_rdrand64_step(__p); +} +#endif +#endif /* __RDRND__ */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__FSGSBASE__) +#ifdef __x86_64__ +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_readfsbase_u32(void) +{ + return __builtin_ia32_rdfsbase32(); +} + +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_readfsbase_u64(void) +{ + return __builtin_ia32_rdfsbase64(); +} + +static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_readgsbase_u32(void) +{ + return __builtin_ia32_rdgsbase32(); +} + +static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_readgsbase_u64(void) +{ + return __builtin_ia32_rdgsbase64(); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_writefsbase_u32(unsigned int __V) +{ + __builtin_ia32_wrfsbase32(__V); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_writefsbase_u64(unsigned long long __V) +{ + __builtin_ia32_wrfsbase64(__V); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_writegsbase_u32(unsigned int __V) +{ + __builtin_ia32_wrgsbase32(__V); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) +_writegsbase_u64(unsigned long long __V) +{ + __builtin_ia32_wrgsbase64(__V); +} + +#endif +#endif /* 
__FSGSBASE__ */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__MOVBE__) + +/* The structs used below are to force the load/store to be unaligned. This + * is accomplished with the __packed__ attribute. The __may_alias__ prevents + * tbaa metadata from being generated based on the struct and the type of the + * field inside of it. + */ + +static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i16(void const * __P) { + struct __loadu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i16(void * __P, short __D) { + struct __storeu_i16 { + short __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D); +} + +static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i32(void const * __P) { + struct __loadu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i32(void * __P, int __D) { + struct __storeu_i32 { + int __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D); +} + +#ifdef __x86_64__ +static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_loadbe_i64(void const * __P) { + struct __loadu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v); +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) +_storebe_i64(void * __P, long long __D) { + struct __storeu_i64 { + long long __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D); +} +#endif +#endif /* __MOVBE */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__RTM__) +#include +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SHA__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__FXSR__) +#include +#endif + +/* No feature check desired due to internal MSC_VER checks */ +#include + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__XSAVEOPT__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__XSAVEC__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__XSAVES__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SHSTK__) +#include +#endif + +/* Some intrinsics inside adxintrin.h are available only on processors with ADX, + * whereas others are also available at all times. 
*/ +#include + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__RDSEED__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WBNOINVD__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__CLDEMOTE__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WAITPKG__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__MOVDIRI__) || defined(__MOVDIR64B__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__PCONFIG__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SGX__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__PTWRITE__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__INVPCID__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__KL__) || defined(__WIDEKL__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__AVX512VP2INTERSECT__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__)) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__ENQCMD__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SERIALIZE__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__TSXLDTRK__) +#include +#endif + +#if defined(_MSC_VER) && __has_extension(gnu_asm) +/* Define the default attributes for these intrinsics */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) +#ifdef __cplusplus +extern "C" { +#endif +/*----------------------------------------------------------------------------*\ +|* Interlocked Exchange HLE +\*----------------------------------------------------------------------------*/ +#if defined(__i386__) || defined(__x86_64__) +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) { + __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) { + __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +#endif +#if defined(__x86_64__) +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) { + __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}" + : "+r" (_Value), "+m" (*_Target) :: "memory"); + return _Value; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) { + __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}" + : "+r" (_Value), "+m" 
(*_Target) :: "memory"); + return _Value; +} +#endif +/*----------------------------------------------------------------------------*\ +|* Interlocked Compare Exchange HLE +\*----------------------------------------------------------------------------*/ +#if defined(__i386__) || defined(__x86_64__) +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination, + long _Exchange, long _Comparand) { + __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +static __inline__ long __DEFAULT_FN_ATTRS +_InterlockedCompareExchange_HLERelease(long volatile *_Destination, + long _Exchange, long _Comparand) { + __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +#endif +#if defined(__x86_64__) +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +static __inline__ __int64 __DEFAULT_FN_ATTRS +_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination, + __int64 _Exchange, __int64 _Comparand) { + __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}" + : "+a" (_Comparand), "+m" (*_Destination) + : "r" (_Exchange) : "memory"); + return _Comparand; +} +#endif +#ifdef __cplusplus +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */ + +#endif /* __IMMINTRIN_H */ diff --git a/include-llvm/invpcidintrin.h b/include-llvm/invpcidintrin.h new file mode 100644 index 0000000..48dae0a --- /dev/null +++ b/include-llvm/invpcidintrin.h @@ -0,0 +1,23 @@ +/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __INVPCIDINTRIN_H +#define __INVPCIDINTRIN_H + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, __target__("invpcid"))) +_invpcid(unsigned int __type, void *__descriptor) { + __builtin_ia32_invpcid(__type, __descriptor); +} + +#endif /* __INVPCIDINTRIN_H */ diff --git a/include-llvm/keylockerintrin.h b/include-llvm/keylockerintrin.h new file mode 100644 index 0000000..ad9428e --- /dev/null +++ b/include-llvm/keylockerintrin.h @@ -0,0 +1,530 @@ +/*===----------------- keylockerintrin.h - KL Intrinsics -------------------=== + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _KEYLOCKERINTRIN_H +#define _KEYLOCKERINTRIN_H + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__KL__) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl"),\ + __min_vector_width__(128))) + +/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl +/// will assigned to EAX, whch specifies the KeySource and whether backing up +/// the key is permitted. The 256-bit encryption key is loaded from the two +/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is +/// loaded from the implicit operand XMM0 which assigned by __intkey. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LOADIWKEY instructions. +/// +/// \operation +/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode) +/// GP (0) +/// FI +/// IF “LOADIWKEY exiting” VM execution control set +/// VMexit +/// FI +/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used +/// GP (0) +/// FI +/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set +/// GP (0) +/// FI +/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part +/// GP (0) +/// FI +/// IF (__ctl[4:1] == 0) // KeySource of 0. +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]: +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // KeySource of 1. 
See RDSEED definition for details of randomness +/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received +/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0] +/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128] +/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0] +/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256] +/// IWKey.NoBackup := __ctl[0] +/// IWKey.KeySource := __ctl[4:1] +/// ZF := 0 +/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded +/// ZF := 1 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_mm_loadiwkey (unsigned int __ctl, __m128i __intkey, + __m128i __enkey_lo, __m128i __enkey_hi) { + __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl); +} + +/// Wrap a 128-bit AES key from __key into a key handle and output in +/// ((__m128i*)__h) to ((__m128i*)__h) + 2 and a 32-bit value as return. +/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY128 instructions. +/// +/// \operation +/// InputKey[127:0] := __key[127:0] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) { + return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h); +} + +/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then +/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and +/// a 32-bit value as return. +/// The explicit source operand __htype specifies handle restrictions. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the ENCODEKEY256 instructions. 
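+///
+/// A minimal usage sketch (illustrative only; the key bytes and the handle
+/// buffer below are placeholders, not part of this header):
+/// \code
+/// __m128i key_lo = _mm_set1_epi8(0x2b);
+/// __m128i key_hi = _mm_set1_epi8(0x7e);
+/// __m128i handle[4];                 /* receives the 512-bit key handle */
+/// unsigned int info = _mm_encodekey256_u32(0, key_lo, key_hi, handle);
+/// /* info[0] is IWKey.NoBackup, info[4:1] is IWKey.KeySource */
+/// \endcode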
+/// +/// \operation +/// InputKey[127:0] := __key_lo[127:0] +/// InputKey[255:128] := __key_hi[255:128] +/// KeyMetadata[2:0] := __htype[2:0] +/// KeyMetadata[23:3] := 0 // Reserved for future usage +/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1) +/// KeyMetadata[127:28] := 0 // Reserved for future usage +/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0], +/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) +/// dst[0] := IWKey.NoBackup +/// dst[4:1] := IWKey.KeySource[3:0] +/// dst[31:5] := 0 +/// MEM[__h+127:__h] := Handle[127:0] // AAD +/// MEM[__h+255:__h+128] := Handle[255:128] // Tag +/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0] +/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128] +/// OF := 0 +/// SF := 0 +/// ZF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi, + void *__h) { + return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo, + (__v2di)__key_hi, __h); +} + +/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENC256KL instructions. +/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic. 
+/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using +/// the 128-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. +/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128) +/// IF (IllegalHandle) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using +/// the 256-bit key in the handle from the __h. It stores the result in the +/// __odata. And return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDEC256KL instructions. 
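+///
+/// A minimal usage sketch (illustrative; \c handle is assumed to hold a
+/// 512-bit handle produced earlier by _mm_encodekey256_u32(), and
+/// \c cipher_block is a placeholder input):
+/// \code
+/// __m128i plain;
+/// unsigned char fail = _mm_aesdec256kl_u8(&plain, cipher_block, handle);
+/// if (fail) {
+///   /* the handle was rejected or failed authentication; plain is zeroed */
+/// }
+/// \endcode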
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[383:256] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256) +/// IF (IllegalHandle) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF (Authentic == 0) +/// ZF := 1 +/// MEM[__odata+127:__odata] := 0 +/// ELSE +/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey) +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { + return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__KL__) */ + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__WIDEKL__) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\ + __min_vector_width__(128))) + +/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE128KL instructions. +/// +/// \operation +/// Handle := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESENCWIDE256KL instructions. 
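+///
+/// A minimal usage sketch (illustrative; \c handle and the eight input
+/// blocks in \c in are placeholders supplied by the caller):
+/// \code
+/// __m128i out[8];
+/// unsigned char fail = _mm_aesencwide256kl_u8(out, in, handle);
+/// /* fail != 0 means the handle was rejected and all eight outputs are zero */
+/// \endcode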
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE128KL instructions. +/// +/// \operation +/// Handle[383:0] := MEM[__h+383:__h] +/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 ) +/// IF (IllegalHandle) +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle +/// at __h and store each resultant block back from __odata to __odata+7. And +/// return the affected ZF flag status. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the AESDECWIDE256KL instructions. 
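/*
 * Illustrative sketch, not part of the upstream header: the wide Key Locker
 * intrinsics process eight blocks per call and, like the single-block forms,
 * return the ZF status. The helper name and `handle128` are hypothetical;
 * a valid 384-bit handle prepared elsewhere is assumed, as is a toolchain
 * with the KL/WIDEKL target features enabled.
 */
#include <immintrin.h>

static int kl_encrypt_8_blocks(__m128i out[8], const __m128i in[8],
                               const void *handle128)
{
    /* All eight output blocks are zeroed when the handle is rejected. */
    unsigned char zf = _mm_aesencwide128kl_u8(out, in, handle128);
    return zf ? -1 : 0;
}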
+/// +/// \operation +/// Handle[511:0] := MEM[__h+511:__h] +/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) || +/// (Handle[127:0] AND (CPL > 0)) || +/// Handle[255:128] || +/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 ) +/// If (IllegalHandle) +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) +/// IF Authentic == 0 +/// ZF := 1 +/// FOR i := 0 to 7 +/// __odata[i] := 0 +/// ENDFOR +/// ELSE +/// FOR i := 0 to 7 +/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey) +/// ENDFOR +/// ZF := 0 +/// FI +/// FI +/// dst := ZF +/// OF := 0 +/// SF := 0 +/// AF := 0 +/// PF := 0 +/// CF := 0 +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { + return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata, + (const __v2di *)__idata, __h); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ + || defined(__WIDEKL__) */ + +#endif /* _KEYLOCKERINTRIN_H */ diff --git a/include-llvm/lwpintrin.h b/include-llvm/lwpintrin.h new file mode 100644 index 0000000..d8ab0db --- /dev/null +++ b/include-llvm/lwpintrin.h @@ -0,0 +1,136 @@ +/*===---- lwpintrin.h - LWP intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __LWPINTRIN_H +#define __LWPINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp"))) + +/// Parses the LWPCB at the specified address and enables +/// profiling if valid. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LLWPCB instruction. +/// +/// \param __addr +/// Address to the new Lightweight Profiling Control Block (LWPCB). If the +/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables +/// Lightweight Profiling. +static __inline__ void __DEFAULT_FN_ATTRS +__llwpcb (void *__addr) +{ + __builtin_ia32_llwpcb(__addr); +} + +/// Flushes the LWP state to memory and returns the address of the LWPCB. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the SLWPCB instruction. +/// +/// \return +/// Address to the current Lightweight Profiling Control Block (LWPCB). +/// If LWP is not currently enabled, returns NULL. +static __inline__ void* __DEFAULT_FN_ATTRS +__slwpcb (void) +{ + return __builtin_ia32_slwpcb(); +} + +/// Inserts programmed event record into the LWP event ring buffer +/// and advances the ring buffer pointer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LWPINS instruction. +/// +/// \param DATA2 +/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. 
+/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, +/// the event record overwrites the last record in the buffer, the MissedEvents +/// counter in the LWPCB is incremented, the head pointer is not advanced, and +/// 1 is returned. Otherwise 0 is returned. +#define __lwpins32(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +/// Decrements the LWP programmed value sample event counter. If the result is +/// negative, inserts an event record into the LWP event ring buffer in memory +/// and advances the ring buffer pointer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LWPVAL instruction. +/// +/// \param DATA2 +/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +#define __lwpval32(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +#ifdef __x86_64__ + +/// Inserts programmed event record into the LWP event ring buffer +/// and advances the ring buffer pointer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LWPINS instruction. +/// +/// \param DATA2 +/// A 64-bit value is inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, +/// the event record overwrites the last record in the buffer, the MissedEvents +/// counter in the LWPCB is incremented, the head pointer is not advanced, and +/// 1 is returned. Otherwise 0 is returned. +#define __lwpins64(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +/// Decrements the LWP programmed value sample event counter. If the result is +/// negative, inserts an event record into the LWP event ring buffer in memory +/// and advances the ring buffer pointer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the LWPVAL instruction. +/// +/// \param DATA2 +/// A 64-bit value is and inserted into the 64-bit Data2 field. +/// \param DATA1 +/// A 32-bit value is inserted into the 32-bit Data1 field. +/// \param FLAGS +/// A 32-bit immediate value is inserted into the 32-bit Flags field. +#define __lwpval64(DATA2, DATA1, FLAGS) \ + (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ + (unsigned int) (FLAGS))) + +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __LWPINTRIN_H */ diff --git a/include-llvm/lzcntintrin.h b/include-llvm/lzcntintrin.h new file mode 100644 index 0000000..f4ddce9 --- /dev/null +++ b/include-llvm/lzcntintrin.h @@ -0,0 +1,104 @@ +/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __LZCNTINTRIN_H +#define __LZCNTINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) + +#ifndef _MSC_VER +/// Counts the number of leading zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 16-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 16-bit integer containing the number of leading zero +/// bits in the operand. +#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X)) +#endif // _MSC_VER + +/// Counts the number of leading zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of leading zero +/// bits in the operand. +/// \see _lzcnt_u32 +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__lzcnt32(unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32(__X); +} + +/// Counts the number of leading zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of leading zero +/// bits in the operand. +/// \see __lzcnt32 +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_lzcnt_u32(unsigned int __X) +{ + return __builtin_ia32_lzcnt_u32(__X); +} + +#ifdef __x86_64__ +#ifndef _MSC_VER +/// Counts the number of leading zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of leading zero +/// bits in the operand. +/// \see _lzcnt_u64 +#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X)) +#endif // _MSC_VER + +/// Counts the number of leading zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c LZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose leading zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of leading zero +/// bits in the operand. +/// \see __lzcnt64 +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +_lzcnt_u64(unsigned long long __X) +{ + return __builtin_ia32_lzcnt_u64(__X); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __LZCNTINTRIN_H */ diff --git a/include-llvm/mm3dnow.h b/include-llvm/mm3dnow.h new file mode 100644 index 0000000..22ab13a --- /dev/null +++ b/include-llvm/mm3dnow.h @@ -0,0 +1,157 @@ +/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef _MM3DNOW_H_INCLUDED +#define _MM3DNOW_H_INCLUDED + +#include +#include + +typedef float __v2sf __attribute__((__vector_size__(8))); + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64))) + +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow"))) +_m_femms(void) { + __builtin_ia32_femms(); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pavgusb(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pf2id(__m64 __m) { + return (__m64)__builtin_ia32_pf2id((__v2sf)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfacc(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfadd(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfcmpeq(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfcmpge(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfcmpgt(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfmax(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfmin(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfmul(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfrcp(__m64 __m) { + return (__m64)__builtin_ia32_pfrcp((__v2sf)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfrcpit1(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfrcpit2(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfrsqrt(__m64 __m) { + return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfrsqrtit1(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfsub(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfsubr(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pi2fd(__m64 __m) { + return (__m64)__builtin_ia32_pi2fd((__v2si)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pmulhrw(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2); +} + +/* Handle the 3dnowa instructions here. 
*/ +#undef __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64))) + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pf2iw(__m64 __m) { + return (__m64)__builtin_ia32_pf2iw((__v2sf)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfnacc(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pfpnacc(__m64 __m1, __m64 __m2) { + return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pi2fw(__m64 __m) { + return (__m64)__builtin_ia32_pi2fw((__v2si)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pswapdsf(__m64 __m) { + return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m); +} + +static __inline__ __m64 __DEFAULT_FN_ATTRS +_m_pswapdsi(__m64 __m) { + return (__m64)__builtin_ia32_pswapdsi((__v2si)__m); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/mm_malloc.h b/include-llvm/mm_malloc.h new file mode 100644 index 0000000..933dbaa --- /dev/null +++ b/include-llvm/mm_malloc.h @@ -0,0 +1,67 @@ +/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __MM_MALLOC_H +#define __MM_MALLOC_H + +#include + +#ifdef _WIN32 +#include +#else +#ifndef __cplusplus +extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size); +#else +// Some systems (e.g. those with GNU libc) declare posix_memalign with an +// exception specifier. Via an "egregious workaround" in +// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid +// redeclaration of glibc's declaration. +extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size); +#endif +#endif + +#if !(defined(_WIN32) && defined(_mm_malloc)) +static __inline__ void *__attribute__((__always_inline__, __nodebug__, + __malloc__)) +_mm_malloc(size_t __size, size_t __align) +{ + if (__align == 1) { + return malloc(__size); + } + + if (!(__align & (__align - 1)) && __align < sizeof(void *)) + __align = sizeof(void *); + + void *__mallocedMemory; +#if defined(__MINGW32__) + __mallocedMemory = __mingw_aligned_malloc(__size, __align); +#elif defined(_WIN32) + __mallocedMemory = _aligned_malloc(__size, __align); +#else + if (posix_memalign(&__mallocedMemory, __align, __size)) + return 0; +#endif + + return __mallocedMemory; +} + +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_mm_free(void *__p) +{ +#if defined(__MINGW32__) + __mingw_aligned_free(__p); +#elif defined(_WIN32) + _aligned_free(__p); +#else + free(__p); +#endif +} +#endif + +#endif /* __MM_MALLOC_H */ diff --git a/include-llvm/mmintrin.h b/include-llvm/mmintrin.h new file mode 100644 index 0000000..03bac92 --- /dev/null +++ b/include-llvm/mmintrin.h @@ -0,0 +1,1562 @@ +/*===---- mmintrin.h - MMX intrinsics --------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __MMINTRIN_H +#define __MMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8))); + +typedef long long __v1di __attribute__((__vector_size__(8))); +typedef int __v2si __attribute__((__vector_size__(8))); +typedef short __v4hi __attribute__((__vector_size__(8))); +typedef char __v8qi __attribute__((__vector_size__(8))); + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) + +/// Clears the MMX state by setting the state of the x87 stack registers +/// to empty. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the EMMS instruction. +/// +static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) +_mm_empty(void) +{ + __builtin_ia32_emms(); +} + +/// Constructs a 64-bit integer vector, setting the lower 32 bits to the +/// value of the 32-bit integer parameter and setting the upper 32 bits to 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVD instruction. +/// +/// \param __i +/// A 32-bit integer value. +/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the +/// parameter. The upper 32 bits are set to 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cvtsi32_si64(int __i) +{ + return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); +} + +/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit +/// signed integer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVD instruction. +/// +/// \param __m +/// A 64-bit integer vector. +/// \returns A 32-bit signed integer value containing the lower 32 bits of the +/// parameter. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvtsi64_si32(__m64 __m) +{ + return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); +} + +/// Casts a 64-bit signed integer value into a 64-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVQ instruction. +/// +/// \param __i +/// A 64-bit signed integer. +/// \returns A 64-bit integer vector containing the same bitwise pattern as the +/// parameter. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cvtsi64_m64(long long __i) +{ + return (__m64)__i; +} + +/// Casts a 64-bit integer vector into a 64-bit signed integer value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVQ instruction. +/// +/// \param __m +/// A 64-bit integer vector. +/// \returns A 64-bit signed integer containing the same bitwise pattern as the +/// parameter. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvtm64_si64(__m64 __m) +{ + return (long long)__m; +} + +/// Converts 16-bit signed integers from both 64-bit integer vector +/// parameters of [4 x i16] into 8-bit signed integer values, and constructs +/// a 64-bit integer vector of [8 x i8] as the result. Positive values +/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80 +/// are saturated to 0x80. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PACKSSWB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. 
Each 16-bit element is treated as a +/// 16-bit signed integer and is converted to an 8-bit signed integer with +/// saturation. Positive values greater than 0x7F are saturated to 0x7F. +/// Negative values less than 0x80 are saturated to 0x80. The converted +/// [4 x i8] values are written to the lower 32 bits of the result. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a +/// 16-bit signed integer and is converted to an 8-bit signed integer with +/// saturation. Positive values greater than 0x7F are saturated to 0x7F. +/// Negative values less than 0x80 are saturated to 0x80. The converted +/// [4 x i8] values are written to the upper 32 bits of the result. +/// \returns A 64-bit integer vector of [8 x i8] containing the converted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_packs_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); +} + +/// Converts 32-bit signed integers from both 64-bit integer vector +/// parameters of [2 x i32] into 16-bit signed integer values, and constructs +/// a 64-bit integer vector of [4 x i16] as the result. Positive values +/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than +/// 0x8000 are saturated to 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PACKSSDW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a +/// 32-bit signed integer and is converted to a 16-bit signed integer with +/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. +/// Negative values less than 0x8000 are saturated to 0x8000. The converted +/// [2 x i16] values are written to the lower 32 bits of the result. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a +/// 32-bit signed integer and is converted to a 16-bit signed integer with +/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. +/// Negative values less than 0x8000 are saturated to 0x8000. The converted +/// [2 x i16] values are written to the upper 32 bits of the result. +/// \returns A 64-bit integer vector of [4 x i16] containing the converted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_packs_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); +} + +/// Converts 16-bit signed integers from both 64-bit integer vector +/// parameters of [4 x i16] into 8-bit unsigned integer values, and +/// constructs a 64-bit integer vector of [8 x i8] as the result. Values +/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated +/// to 0. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PACKUSWB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a +/// 16-bit signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0 are saturated to 0. The converted [4 x i8] values are written to +/// the lower 32 bits of the result. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a +/// 16-bit signed integer and is converted to an 8-bit unsigned integer with +/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less +/// than 0 are saturated to 0. 
The converted [4 x i8] values are written to +/// the upper 32 bits of the result. +/// \returns A 64-bit integer vector of [8 x i8] containing the converted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_packs_pu16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); +} + +/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] +/// and interleaves them into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKHBW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. \n +/// Bits [39:32] are written to bits [7:0] of the result. \n +/// Bits [47:40] are written to bits [23:16] of the result. \n +/// Bits [55:48] are written to bits [39:32] of the result. \n +/// Bits [63:56] are written to bits [55:48] of the result. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// Bits [39:32] are written to bits [15:8] of the result. \n +/// Bits [47:40] are written to bits [31:24] of the result. \n +/// Bits [55:48] are written to bits [47:40] of the result. \n +/// Bits [63:56] are written to bits [63:56] of the result. +/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); +} + +/// Unpacks the upper 32 bits from two 64-bit integer vectors of +/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKHWD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// Bits [47:32] are written to bits [15:0] of the result. \n +/// Bits [63:48] are written to bits [47:32] of the result. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// Bits [47:32] are written to bits [31:16] of the result. \n +/// Bits [63:48] are written to bits [63:48] of the result. +/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); +} + +/// Unpacks the upper 32 bits from two 64-bit integer vectors of +/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKHDQ instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to +/// the lower 32 bits of the result. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to +/// the upper 32 bits of the result. +/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); +} + +/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] +/// and interleaves them into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKLBW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// Bits [7:0] are written to bits [7:0] of the result. \n +/// Bits [15:8] are written to bits [23:16] of the result. \n +/// Bits [23:16] are written to bits [39:32] of the result. 
\n +/// Bits [31:24] are written to bits [55:48] of the result. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// Bits [7:0] are written to bits [15:8] of the result. \n +/// Bits [15:8] are written to bits [31:24] of the result. \n +/// Bits [23:16] are written to bits [47:40] of the result. \n +/// Bits [31:24] are written to bits [63:56] of the result. +/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); +} + +/// Unpacks the lower 32 bits from two 64-bit integer vectors of +/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKLWD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// Bits [15:0] are written to bits [15:0] of the result. \n +/// Bits [31:16] are written to bits [47:32] of the result. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// Bits [15:0] are written to bits [31:16] of the result. \n +/// Bits [31:16] are written to bits [63:48] of the result. +/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); +} + +/// Unpacks the lower 32 bits from two 64-bit integer vectors of +/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PUNPCKLDQ instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to +/// the lower 32 bits of the result. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to +/// the upper 32 bits of the result. +/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); +} + +/// Adds each 8-bit integer element of the first 64-bit integer vector +/// of [8 x i8] to the corresponding 8-bit integer element of the second +/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are +/// packed into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both +/// parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_add_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Adds each 16-bit integer element of the first 64-bit integer vector +/// of [4 x i16] to the corresponding 16-bit integer element of the second +/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are +/// packed into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. 
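/*
 * Illustrative sketch, not part of the upstream header: packing two
 * [4 x i16] vectors into one [8 x i8] vector with signed saturation, as
 * described for _mm_packs_pi16 above. The helper name and constants are
 * chosen only for illustration.
 */
#include <mmintrin.h>

static long long pack_words_example(void)
{
    /* i16 lanes, low to high: 0x0002, 0x0001, 0x8000, 0x7FFF */
    __m64 lo = _mm_cvtsi64_m64(0x7FFF800000010002LL);
    __m64 hi = _mm_cvtsi64_m64(0x0003000400050006LL);
    __m64 packed = _mm_packs_pi16(lo, hi);   /* 0x8000 -> 0x80, 0x7FFF -> 0x7F */
    long long bits = _mm_cvtm64_si64(packed);
    _mm_empty();                             /* leave MMX state before FP code */
    return bits;
}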
+/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both +/// parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_add_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Adds each 32-bit integer element of the first 64-bit integer vector +/// of [2 x i32] to the corresponding 32-bit integer element of the second +/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are +/// packed into a 64-bit integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. +/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both +/// parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_add_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); +} + +/// Adds each 8-bit signed integer element of the first 64-bit integer +/// vector of [8 x i8] to the corresponding 8-bit signed integer element of +/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than +/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to +/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums +/// of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_adds_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Adds each 16-bit signed integer element of the first 64-bit integer +/// vector of [4 x i16] to the corresponding 16-bit signed integer element of +/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than +/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are +/// saturated to 0x8000. The results are packed into a 64-bit integer vector +/// of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums +/// of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_adds_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Adds each 8-bit unsigned integer element of the first 64-bit integer +/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of +/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are +/// saturated to 0xFF. The results are packed into a 64-bit integer vector of +/// [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDUSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated +/// unsigned sums of both parameters. 
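/*
 * Illustrative sketch, not part of the upstream header: element-wise
 * saturating addition on [4 x i16]. Without saturation 0x7FFF + 1 wraps to
 * 0x8000; _mm_adds_pi16 clamps it to 0x7FFF instead. Helper name and
 * constants are illustrative only.
 */
#include <mmintrin.h>

static long long adds_pi16_example(void)
{
    __m64 a = _mm_cvtsi64_m64(0x7FFF000100020003LL);
    __m64 b = _mm_cvtsi64_m64(0x0001000100010001LL);
    __m64 wrap = _mm_add_pi16(a, b);   /* top lane wraps to 0x8000 */
    __m64 sat  = _mm_adds_pi16(a, b);  /* top lane saturates at 0x7FFF */
    long long r = _mm_cvtm64_si64(wrap) ^ _mm_cvtm64_si64(sat);
    _mm_empty();
    return r;
}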
+static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_adds_pu8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Adds each 16-bit unsigned integer element of the first 64-bit integer +/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element +/// of the second 64-bit integer vector of [4 x i16]. Sums greater than +/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit +/// integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PADDUSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated +/// unsigned sums of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_adds_pu16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Subtracts each 8-bit integer element of the second 64-bit integer +/// vector of [8 x i8] from the corresponding 8-bit integer element of the +/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results +/// are packed into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8] containing the subtrahends. +/// \returns A 64-bit integer vector of [8 x i8] containing the differences of +/// both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sub_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Subtracts each 16-bit integer element of the second 64-bit integer +/// vector of [4 x i16] from the corresponding 16-bit integer element of the +/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the +/// results are packed into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16] containing the subtrahends. +/// \returns A 64-bit integer vector of [4 x i16] containing the differences of +/// both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sub_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Subtracts each 32-bit integer element of the second 64-bit integer +/// vector of [2 x i32] from the corresponding 32-bit integer element of the +/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the +/// results are packed into a 64-bit integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32] containing the subtrahends. +/// \returns A 64-bit integer vector of [2 x i32] containing the differences of +/// both parameters. 
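/*
 * Illustrative sketch, not part of the upstream header: unsigned saturating
 * addition on [4 x i16]. A lane holding 0xFFFF plus 2 clamps to 0xFFFF
 * rather than wrapping. Helper name and constants are illustrative only.
 */
#include <mmintrin.h>

static long long adds_pu16_example(void)
{
    __m64 a = _mm_cvtsi64_m64((long long)0xFFFF000000010002ULL);
    __m64 b = _mm_cvtsi64_m64(0x0002000200020002LL);
    long long r = _mm_cvtm64_si64(_mm_adds_pu16(a, b)); /* top lane -> 0xFFFF */
    _mm_empty();
    return r;
}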
+static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sub_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); +} + +/// Subtracts each 8-bit signed integer element of the second 64-bit +/// integer vector of [8 x i8] from the corresponding 8-bit signed integer +/// element of the first 64-bit integer vector of [8 x i8]. Positive results +/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80 +/// are saturated to 0x80. The results are packed into a 64-bit integer +/// vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8] containing the subtrahends. +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated +/// differences of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_subs_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Subtracts each 16-bit signed integer element of the second 64-bit +/// integer vector of [4 x i16] from the corresponding 16-bit signed integer +/// element of the first 64-bit integer vector of [4 x i16]. Positive results +/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than +/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit +/// integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBSW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16] containing the subtrahends. +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated +/// differences of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_subs_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Subtracts each 8-bit unsigned integer element of the second 64-bit +/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer +/// element of the first 64-bit integer vector of [8 x i8]. +/// +/// If an element of the first vector is less than the corresponding element +/// of the second vector, the result is saturated to 0. The results are +/// packed into a 64-bit integer vector of [8 x i8]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBUSB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8] containing the subtrahends. +/// \returns A 64-bit integer vector of [8 x i8] containing the saturated +/// differences of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_subs_pu8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Subtracts each 16-bit unsigned integer element of the second 64-bit +/// integer vector of [4 x i16] from the corresponding 16-bit unsigned +/// integer element of the first 64-bit integer vector of [4 x i16]. +/// +/// If an element of the first vector is less than the corresponding element +/// of the second vector, the result is saturated to 0. The results are +/// packed into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSUBUSW instruction. 
+/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16] containing the minuends. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16] containing the subtrahends. +/// \returns A 64-bit integer vector of [4 x i16] containing the saturated +/// differences of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_subs_pu16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Multiplies each 16-bit signed integer element of the first 64-bit +/// integer vector of [4 x i16] by the corresponding 16-bit signed integer +/// element of the second 64-bit integer vector of [4 x i16] and get four +/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums. +/// The lower 32 bits of these two sums are packed into a 64-bit integer +/// vector of [2 x i32]. +/// +/// For example, bits [15:0] of both parameters are multiplied, bits [31:16] +/// of both parameters are multiplied, and the sum of both results is written +/// to bits [31:0] of the result. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMADDWD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [2 x i32] containing the sums of +/// products of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_madd_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); +} + +/// Multiplies each 16-bit signed integer element of the first 64-bit +/// integer vector of [4 x i16] by the corresponding 16-bit signed integer +/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper +/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMULHW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits +/// of the products of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_mulhi_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Multiplies each 16-bit signed integer element of the first 64-bit +/// integer vector of [4 x i16] by the corresponding 16-bit signed integer +/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower +/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMULLW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits +/// of the products of both parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_mullo_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Left-shifts each 16-bit signed integer element of the first +/// parameter, which is a 64-bit integer vector of [4 x i16], by the number +/// of bits specified by the second parameter, which is a 64-bit integer. The +/// lower 16 bits of the results are packed into a 64-bit integer vector of +/// [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLW instruction. 
+/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted +/// values. If \a __count is greater or equal to 16, the result is set to all +/// 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sll_pi16(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); +} + +/// Left-shifts each 16-bit signed integer element of a 64-bit integer +/// vector of [4 x i16] by the number of bits specified by a 32-bit integer. +/// The lower 16 bits of the results are packed into a 64-bit integer vector +/// of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLW instruction. +/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted +/// values. If \a __count is greater or equal to 16, the result is set to all +/// 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_slli_pi16(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); +} + +/// Left-shifts each 32-bit signed integer element of the first +/// parameter, which is a 64-bit integer vector of [2 x i32], by the number +/// of bits specified by the second parameter, which is a 64-bit integer. The +/// lower 32 bits of the results are packed into a 64-bit integer vector of +/// [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted +/// values. If \a __count is greater or equal to 32, the result is set to all +/// 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sll_pi32(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); +} + +/// Left-shifts each 32-bit signed integer element of a 64-bit integer +/// vector of [2 x i32] by the number of bits specified by a 32-bit integer. +/// The lower 32 bits of the results are packed into a 64-bit integer vector +/// of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted +/// values. If \a __count is greater or equal to 32, the result is set to all +/// 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_slli_pi32(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); +} + +/// Left-shifts the first 64-bit integer parameter by the number of bits +/// specified by the second 64-bit integer parameter. The lower 64 bits of +/// result are returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector containing the left-shifted value. If +/// \a __count is greater or equal to 64, the result is set to 0. 
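/*
 * Illustrative sketch, not part of the upstream header: shifting each 16-bit
 * lane left by an immediate count with _mm_slli_pi16. As noted in the
 * descriptions above, counts of 16 or more clear the result. Helper name and
 * constants are illustrative only.
 */
#include <mmintrin.h>

static long long slli_pi16_example(void)
{
    __m64 v = _mm_cvtsi64_m64(0x0001000200030004LL);
    long long r = _mm_cvtm64_si64(_mm_slli_pi16(v, 4)); /* each lane * 16 */
    _mm_empty();
    return r;
}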
+static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sll_si64(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); +} + +/// Left-shifts the first parameter, which is a 64-bit integer, by the +/// number of bits specified by the second parameter, which is a 32-bit +/// integer. The lower 64 bits of result are returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSLLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector containing the left-shifted value. If +/// \a __count is greater or equal to 64, the result is set to 0. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_slli_si64(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); +} + +/// Right-shifts each 16-bit integer element of the first parameter, +/// which is a 64-bit integer vector of [4 x i16], by the number of bits +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 16-bit element. The 16-bit results are packed into a 64-bit integer +/// vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRAW instruction. +/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sra_pi16(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); +} + +/// Right-shifts each 16-bit integer element of a 64-bit integer vector +/// of [4 x i16] by the number of bits specified by a 32-bit integer. +/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 16-bit element. The 16-bit results are packed into a 64-bit integer +/// vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRAW instruction. +/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srai_pi16(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); +} + +/// Right-shifts each 32-bit integer element of the first parameter, +/// which is a 64-bit integer vector of [2 x i32], by the number of bits +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 32-bit element. The 32-bit results are packed into a 64-bit integer +/// vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRAD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_sra_pi32(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); +} + +/// Right-shifts each 32-bit integer element of a 64-bit integer vector +/// of [2 x i32] by the number of bits specified by a 32-bit integer. 
+/// +/// High-order bits are filled with the sign bit of the initial value of each +/// 32-bit element. The 32-bit results are packed into a 64-bit integer +/// vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRAD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srai_pi32(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); +} + +/// Right-shifts each 16-bit integer element of the first parameter, +/// which is a 64-bit integer vector of [4 x i16], by the number of bits +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are cleared. The 16-bit results are packed into a 64-bit +/// integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLW instruction. +/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srl_pi16(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); +} + +/// Right-shifts each 16-bit integer element of a 64-bit integer vector +/// of [4 x i16] by the number of bits specified by a 32-bit integer. +/// +/// High-order bits are cleared. The 16-bit results are packed into a 64-bit +/// integer vector of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLW instruction. +/// +/// \param __m +/// A 64-bit integer vector of [4 x i16]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srli_pi16(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); +} + +/// Right-shifts each 32-bit integer element of the first parameter, +/// which is a 64-bit integer vector of [2 x i32], by the number of bits +/// specified by the second parameter, which is a 64-bit integer. +/// +/// High-order bits are cleared. The 32-bit results are packed into a 64-bit +/// integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srl_pi32(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); +} + +/// Right-shifts each 32-bit integer element of a 64-bit integer vector +/// of [2 x i32] by the number of bits specified by a 32-bit integer. +/// +/// High-order bits are cleared. The 32-bit results are packed into a 64-bit +/// integer vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLD instruction. +/// +/// \param __m +/// A 64-bit integer vector of [2 x i32]. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted +/// values. 
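/*
 * Illustrative sketch, not part of the upstream header: the difference
 * between the arithmetic and logical right shifts documented above on a
 * negative 16-bit lane. _mm_srai_pi16 replicates the sign bit, while
 * _mm_srli_pi16 shifts in zeros. Helper name and constants are illustrative.
 */
#include <mmintrin.h>

static long long shift_right_example(void)
{
    /* top lane is 0xFFF0, i.e. -16 as a signed 16-bit value */
    __m64 v = _mm_cvtsi64_m64((long long)0xFFF0000000080010ULL);
    long long arith = _mm_cvtm64_si64(_mm_srai_pi16(v, 4)); /* top lane 0xFFFF */
    long long logic = _mm_cvtm64_si64(_mm_srli_pi16(v, 4)); /* top lane 0x0FFF */
    _mm_empty();
    return arith ^ logic;
}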
+static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srli_pi32(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); +} + +/// Right-shifts the first 64-bit integer parameter by the number of bits +/// specified by the second 64-bit integer parameter. +/// +/// High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \param __count +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \returns A 64-bit integer vector containing the right-shifted value. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srl_si64(__m64 __m, __m64 __count) +{ + return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); +} + +/// Right-shifts the first parameter, which is a 64-bit integer, by the +/// number of bits specified by the second parameter, which is a 32-bit +/// integer. +/// +/// High-order bits are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSRLQ instruction. +/// +/// \param __m +/// A 64-bit integer vector interpreted as a single 64-bit integer. +/// \param __count +/// A 32-bit integer value. +/// \returns A 64-bit integer vector containing the right-shifted value. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_srli_si64(__m64 __m, int __count) +{ + return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); +} + +/// Performs a bitwise AND of two 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PAND instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the bitwise AND of both +/// parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_and_si64(__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); +} + +/// Performs a bitwise NOT of the first 64-bit integer vector, and then +/// performs a bitwise AND of the intermediate result and the second 64-bit +/// integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PANDN instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. The one's complement of this parameter is used +/// in the bitwise AND. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the bitwise AND of the second +/// parameter and the one's complement of the first parameter. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_andnot_si64(__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); +} + +/// Performs a bitwise OR of two 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the POR instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the bitwise OR of both +/// parameters. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_or_si64(__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); +} + +/// Performs a bitwise exclusive OR of two 64-bit integer vectors. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PXOR instruction. +/// +/// \param __m1 +/// A 64-bit integer vector. +/// \param __m2 +/// A 64-bit integer vector. +/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both +/// parameters. 
+static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_xor_si64(__m64 __m1, __m64 __m2) +{ + return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); +} + +/// Compares the 8-bit integer elements of two 64-bit integer vectors of +/// [8 x i8] to determine if the element of the first vector is equal to the +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPEQB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// \returns A 64-bit integer vector of [8 x i8] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Compares the 16-bit integer elements of two 64-bit integer vectors of +/// [4 x i16] to determine if the element of the first vector is equal to the +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPEQW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. +/// \returns A 64-bit integer vector of [4 x i16] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Compares the 32-bit integer elements of two 64-bit integer vectors of +/// [2 x i32] to determine if the element of the first vector is equal to the +/// corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPEQD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. +/// \returns A 64-bit integer vector of [2 x i32] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); +} + +/// Compares the 8-bit integer elements of two 64-bit integer vectors of +/// [8 x i8] to determine if the element of the first vector is greater than +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPGTB instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [8 x i8]. +/// \param __m2 +/// A 64-bit integer vector of [8 x i8]. +/// \returns A 64-bit integer vector of [8 x i8] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); +} + +/// Compares the 16-bit integer elements of two 64-bit integer vectors of +/// [4 x i16] to determine if the element of the first vector is greater than +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPGTW instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [4 x i16]. +/// \param __m2 +/// A 64-bit integer vector of [4 x i16]. 
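/*
 * Illustrative usage sketch (not part of the upstream header): a branchless
 * per-element maximum of packed 16-bit integers built from the MMX compare
 * and bitwise intrinsics in this header. The helper name is hypothetical.
 */
static __inline__ __m64
example_max_pi16(__m64 a, __m64 b)
{
    __m64 mask = _mm_cmpgt_pi16(a, b);            /* 0xFFFF where a > b, else 0 */
    return _mm_or_si64(_mm_and_si64(mask, a),     /* keep a where a > b         */
                       _mm_andnot_si64(mask, b)); /* keep b elsewhere           */
}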
+/// \returns A 64-bit integer vector of [4 x i16] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); +} + +/// Compares the 32-bit integer elements of two 64-bit integer vectors of +/// [2 x i32] to determine if the element of the first vector is greater than +/// the corresponding element of the second vector. +/// +/// The comparison yields 0 for false, 0xFFFFFFFF for true. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PCMPGTD instruction. +/// +/// \param __m1 +/// A 64-bit integer vector of [2 x i32]. +/// \param __m2 +/// A 64-bit integer vector of [2 x i32]. +/// \returns A 64-bit integer vector of [2 x i32] containing the comparison +/// results. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) +{ + return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); +} + +/// Constructs a 64-bit integer vector initialized to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PXOR instruction. +/// +/// \returns An initialized 64-bit integer vector with all elements set to zero. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_setzero_si64(void) +{ + return __extension__ (__m64){ 0LL }; +} + +/// Constructs a 64-bit integer vector initialized with the specified +/// 32-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i1 +/// A 32-bit integer value used to initialize the upper 32 bits of the +/// result. +/// \param __i0 +/// A 32-bit integer value used to initialize the lower 32 bits of the +/// result. +/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set_pi32(int __i1, int __i0) +{ + return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); +} + +/// Constructs a 64-bit integer vector initialized with the specified +/// 16-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __s3 +/// A 16-bit integer value used to initialize bits [63:48] of the result. +/// \param __s2 +/// A 16-bit integer value used to initialize bits [47:32] of the result. +/// \param __s1 +/// A 16-bit integer value used to initialize bits [31:16] of the result. +/// \param __s0 +/// A 16-bit integer value used to initialize bits [15:0] of the result. +/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) +{ + return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); +} + +/// Constructs a 64-bit integer vector initialized with the specified +/// 8-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b7 +/// An 8-bit integer value used to initialize bits [63:56] of the result. +/// \param __b6 +/// An 8-bit integer value used to initialize bits [55:48] of the result. +/// \param __b5 +/// An 8-bit integer value used to initialize bits [47:40] of the result. +/// \param __b4 +/// An 8-bit integer value used to initialize bits [39:32] of the result. +/// \param __b3 +/// An 8-bit integer value used to initialize bits [31:24] of the result. 
+/// \param __b2 +/// An 8-bit integer value used to initialize bits [23:16] of the result. +/// \param __b1 +/// An 8-bit integer value used to initialize bits [15:8] of the result. +/// \param __b0 +/// An 8-bit integer value used to initialize bits [7:0] of the result. +/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, + char __b1, char __b0) +{ + return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, + __b4, __b5, __b6, __b7); +} + +/// Constructs a 64-bit integer vector of [2 x i32], with each of the +/// 32-bit integer vector elements set to the specified 32-bit integer +/// value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i +/// A 32-bit integer value used to initialize each vector element of the +/// result. +/// \returns An initialized 64-bit integer vector of [2 x i32]. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set1_pi32(int __i) +{ + return _mm_set_pi32(__i, __i); +} + +/// Constructs a 64-bit integer vector of [4 x i16], with each of the +/// 16-bit integer vector elements set to the specified 16-bit integer +/// value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w +/// A 16-bit integer value used to initialize each vector element of the +/// result. +/// \returns An initialized 64-bit integer vector of [4 x i16]. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set1_pi16(short __w) +{ + return _mm_set_pi16(__w, __w, __w, __w); +} + +/// Constructs a 64-bit integer vector of [8 x i8], with each of the +/// 8-bit integer vector elements set to the specified 8-bit integer value. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b +/// An 8-bit integer value used to initialize each vector element of the +/// result. +/// \returns An initialized 64-bit integer vector of [8 x i8]. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_set1_pi8(char __b) +{ + return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); +} + +/// Constructs a 64-bit integer vector, initialized in reverse order with +/// the specified 32-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __i0 +/// A 32-bit integer value used to initialize the lower 32 bits of the +/// result. +/// \param __i1 +/// A 32-bit integer value used to initialize the upper 32 bits of the +/// result. +/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_setr_pi32(int __i0, int __i1) +{ + return _mm_set_pi32(__i1, __i0); +} + +/// Constructs a 64-bit integer vector, initialized in reverse order with +/// the specified 16-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __w0 +/// A 16-bit integer value used to initialize bits [15:0] of the result. +/// \param __w1 +/// A 16-bit integer value used to initialize bits [31:16] of the result. +/// \param __w2 +/// A 16-bit integer value used to initialize bits [47:32] of the result. +/// \param __w3 +/// A 16-bit integer value used to initialize bits [63:48] of the result. 
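/*
 * Illustrative usage sketch (not part of the upstream header): _mm_set_*
 * takes arguments from the most significant element down, while _mm_setr_*
 * takes them in reversed (memory) order, so the two calls below construct
 * the same vector. The helper name is hypothetical.
 */
static __inline__ void
example_set_ordering(void)
{
    __m64 a = _mm_set_pi16(4, 3, 2, 1);   /* bits [63:48] = 4 ... bits [15:0] = 1 */
    __m64 b = _mm_setr_pi16(1, 2, 3, 4);  /* bits [15:0] = 1 ... bits [63:48] = 4 */
    (void)a; (void)b;
}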
+/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) +{ + return _mm_set_pi16(__w3, __w2, __w1, __w0); +} + +/// Constructs a 64-bit integer vector, initialized in reverse order with +/// the specified 8-bit integer values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __b0 +/// An 8-bit integer value used to initialize bits [7:0] of the result. +/// \param __b1 +/// An 8-bit integer value used to initialize bits [15:8] of the result. +/// \param __b2 +/// An 8-bit integer value used to initialize bits [23:16] of the result. +/// \param __b3 +/// An 8-bit integer value used to initialize bits [31:24] of the result. +/// \param __b4 +/// An 8-bit integer value used to initialize bits [39:32] of the result. +/// \param __b5 +/// An 8-bit integer value used to initialize bits [47:40] of the result. +/// \param __b6 +/// An 8-bit integer value used to initialize bits [55:48] of the result. +/// \param __b7 +/// An 8-bit integer value used to initialize bits [63:56] of the result. +/// \returns An initialized 64-bit integer vector. +static __inline__ __m64 __DEFAULT_FN_ATTRS +_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, + char __b6, char __b7) +{ + return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); +} + +#undef __DEFAULT_FN_ATTRS + +/* Aliases for compatibility. */ +#define _m_empty _mm_empty +#define _m_from_int _mm_cvtsi32_si64 +#define _m_from_int64 _mm_cvtsi64_m64 +#define _m_to_int _mm_cvtsi64_si32 +#define _m_to_int64 _mm_cvtm64_si64 +#define _m_packsswb _mm_packs_pi16 +#define _m_packssdw _mm_packs_pi32 +#define _m_packuswb _mm_packs_pu16 +#define _m_punpckhbw _mm_unpackhi_pi8 +#define _m_punpckhwd _mm_unpackhi_pi16 +#define _m_punpckhdq _mm_unpackhi_pi32 +#define _m_punpcklbw _mm_unpacklo_pi8 +#define _m_punpcklwd _mm_unpacklo_pi16 +#define _m_punpckldq _mm_unpacklo_pi32 +#define _m_paddb _mm_add_pi8 +#define _m_paddw _mm_add_pi16 +#define _m_paddd _mm_add_pi32 +#define _m_paddsb _mm_adds_pi8 +#define _m_paddsw _mm_adds_pi16 +#define _m_paddusb _mm_adds_pu8 +#define _m_paddusw _mm_adds_pu16 +#define _m_psubb _mm_sub_pi8 +#define _m_psubw _mm_sub_pi16 +#define _m_psubd _mm_sub_pi32 +#define _m_psubsb _mm_subs_pi8 +#define _m_psubsw _mm_subs_pi16 +#define _m_psubusb _mm_subs_pu8 +#define _m_psubusw _mm_subs_pu16 +#define _m_pmaddwd _mm_madd_pi16 +#define _m_pmulhw _mm_mulhi_pi16 +#define _m_pmullw _mm_mullo_pi16 +#define _m_psllw _mm_sll_pi16 +#define _m_psllwi _mm_slli_pi16 +#define _m_pslld _mm_sll_pi32 +#define _m_pslldi _mm_slli_pi32 +#define _m_psllq _mm_sll_si64 +#define _m_psllqi _mm_slli_si64 +#define _m_psraw _mm_sra_pi16 +#define _m_psrawi _mm_srai_pi16 +#define _m_psrad _mm_sra_pi32 +#define _m_psradi _mm_srai_pi32 +#define _m_psrlw _mm_srl_pi16 +#define _m_psrlwi _mm_srli_pi16 +#define _m_psrld _mm_srl_pi32 +#define _m_psrldi _mm_srli_pi32 +#define _m_psrlq _mm_srl_si64 +#define _m_psrlqi _mm_srli_si64 +#define _m_pand _mm_and_si64 +#define _m_pandn _mm_andnot_si64 +#define _m_por _mm_or_si64 +#define _m_pxor _mm_xor_si64 +#define _m_pcmpeqb _mm_cmpeq_pi8 +#define _m_pcmpeqw _mm_cmpeq_pi16 +#define _m_pcmpeqd _mm_cmpeq_pi32 +#define _m_pcmpgtb _mm_cmpgt_pi8 +#define _m_pcmpgtw _mm_cmpgt_pi16 +#define _m_pcmpgtd _mm_cmpgt_pi32 + +#endif /* __MMINTRIN_H */ + diff --git a/include-llvm/movdirintrin.h b/include-llvm/movdirintrin.h 
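/*
 * Illustrative note (not part of either header): the _m_* compatibility
 * aliases defined at the end of mmintrin.h above expand to the _mm_*
 * intrinsics, so legacy code written against the older MMX naming keeps
 * compiling unchanged. The helper name below is hypothetical.
 */
static __inline__ __m64
example_compat_alias(__m64 a, __m64 b)
{
    __m64 x = _m_paddw(a, b);        /* legacy spelling                        */
    __m64 y = _mm_add_pi16(a, b);    /* current intrinsic spelling             */
    return _mm_xor_si64(x, y);       /* all-zero: both name the same operation */
}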
new file mode 100644 index 0000000..30c4d02 --- /dev/null +++ b/include-llvm/movdirintrin.h @@ -0,0 +1,49 @@ +/*===------------------------- movdirintrin.h ------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef _MOVDIRINTRIN_H +#define _MOVDIRINTRIN_H + +/* Move doubleword as direct store */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) +_directstoreu_u32 (void *__dst, unsigned int __value) +{ + __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value); +} + +#ifdef __x86_64__ + +/* Move quadword as direct store */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) +_directstoreu_u64 (void *__dst, unsigned long __value) +{ + __builtin_ia32_directstore_u64((unsigned long *)__dst, __value); +} + +#endif /* __x86_64__ */ + +/* + * movdir64b - Move 64 bytes as direct store. + * The destination must be 64 byte aligned, and the store is atomic. + * The source address has no alignment requirement, and the load from + * the source address is not atomic. + */ +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("movdir64b"))) +_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src) +{ + __builtin_ia32_movdir64b(__dst, __src); +} + +#endif /* _MOVDIRINTRIN_H */ diff --git a/include-llvm/mwaitxintrin.h b/include-llvm/mwaitxintrin.h new file mode 100644 index 0000000..ed48538 --- /dev/null +++ b/include-llvm/mwaitxintrin.h @@ -0,0 +1,33 @@ +/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __MWAITXINTRIN_H +#define __MWAITXINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx"))) +static __inline__ void __DEFAULT_FN_ATTRS +_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints) +{ + __builtin_ia32_monitorx(__p, __extensions, __hints); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock) +{ + __builtin_ia32_mwaitx(__extensions, __hints, __clock); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __MWAITXINTRIN_H */ diff --git a/include-llvm/nmmintrin.h b/include-llvm/nmmintrin.h new file mode 100644 index 0000000..59fc7ec --- /dev/null +++ b/include-llvm/nmmintrin.h @@ -0,0 +1,20 @@ +/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __NMMINTRIN_H +#define __NMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h, + just include it now then. */ +#include +#endif /* __NMMINTRIN_H */ diff --git a/include-llvm/pconfigintrin.h b/include-llvm/pconfigintrin.h new file mode 100644 index 0000000..d2014b0 --- /dev/null +++ b/include-llvm/pconfigintrin.h @@ -0,0 +1,40 @@ +/*===---- pconfigintrin.h - X86 platform configuration ---------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __PCONFIGINTRIN_H +#define __PCONFIGINTRIN_H + +#define __PCONFIG_KEY_PROGRAM 0x00000001 + +#if __has_extension(gnu_asm) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("pconfig"))) + +static __inline unsigned int __DEFAULT_FN_ATTRS +_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("pconfig" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __has_extension(gnu_asm) */ + +#endif diff --git a/include-llvm/pkuintrin.h b/include-llvm/pkuintrin.h new file mode 100644 index 0000000..c62080b --- /dev/null +++ b/include-llvm/pkuintrin.h @@ -0,0 +1,34 @@ +/*===---- pkuintrin.h - PKU intrinsics -------------------------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __PKUINTRIN_H +#define __PKUINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku"))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_rdpkru_u32(void) +{ + return __builtin_ia32_rdpkru(); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_wrpkru(unsigned int __val) +{ + __builtin_ia32_wrpkru(__val); +} + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/pmmintrin.h b/include-llvm/pmmintrin.h new file mode 100644 index 0000000..eda8356 --- /dev/null +++ b/include-llvm/pmmintrin.h @@ -0,0 +1,294 @@ +/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __PMMINTRIN_H +#define __PMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128))) + +/// Loads data from an unaligned memory location to elements in a 128-bit +/// vector. +/// +/// If the address of the data is not 16-byte aligned, the instruction may +/// read two adjacent aligned blocks of memory to retrieve the requested +/// data. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VLDDQU instruction. +/// +/// \param __p +/// A pointer to a 128-bit integer vector containing integer values. +/// \returns A 128-bit vector containing the moved values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_lddqu_si128(__m128i const *__p) +{ + return (__m128i)__builtin_ia32_lddqu((char const *)__p); +} + +/// Adds the even-indexed values and subtracts the odd-indexed values of +/// two 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSUBPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the left source operand. +/// \param __b +/// A 128-bit vector of [4 x float] containing the right source operand. +/// \returns A 128-bit vector of [4 x float] containing the alternating sums and +/// differences of both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_addsub_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in two +/// 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHADDPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of +/// both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_hadd_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in two +/// 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHSUBPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The horizontal differences between the values are stored in the lower +/// bits of the destination. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The horizontal differences between the values are stored in the upper +/// bits of the destination. +/// \returns A 128-bit vector of [4 x float] containing the horizontal +/// differences of both operands. 
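/*
 * Illustrative usage sketch (not part of the upstream header): reducing a
 * vector of [4 x float] to a single sum with two horizontal adds. The helper
 * name is hypothetical; _mm_cvtss_f32 comes from xmmintrin.h, which this
 * header pulls in transitively.
 */
static __inline__ float
example_sum4(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);   /* [a+b, c+d, a+b, c+d] */
    t = _mm_hadd_ps(t, t);          /* [a+b+c+d, ...]       */
    return _mm_cvtss_f32(t);        /* extract the low lane */
}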
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_hsub_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); +} + +/// Moves and duplicates odd-indexed values from a 128-bit vector +/// of [4 x float] to float values stored in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSHDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. \n +/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of +/// the destination. \n +/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated +/// values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_movehdup_ps(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); +} + +/// Duplicates even-indexed values from a 128-bit vector of +/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSLDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float] \n +/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of +/// the destination. \n +/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the +/// destination. +/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated +/// values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_moveldup_ps(__m128 __a) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); +} + +/// Adds the even-indexed values and subtracts the odd-indexed values of +/// two 128-bit vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing the left source operand. +/// \param __b +/// A 128-bit vector of [2 x double] containing the right source operand. +/// \returns A 128-bit vector of [2 x double] containing the alternating sums +/// and differences of both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_addsub_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); +} + +/// Horizontally adds the pairs of values contained in two 128-bit +/// vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHADDPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// The horizontal sum of the values is stored in the lower bits of the +/// destination. +/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// The horizontal sum of the values is stored in the upper bits of the +/// destination. +/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of +/// both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_hadd_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); +} + +/// Horizontally subtracts the pairs of values contained in two 128-bit +/// vectors of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VHSUBPD instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// The horizontal difference of the values is stored in the lower bits of +/// the destination. 
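/*
 * Illustrative usage sketch (not part of the upstream header): multiplying
 * two vectors of interleaved complex floats (re, im, re, im) using the
 * duplicate and addsub intrinsics above. _mm_mul_ps, _mm_shuffle_ps and
 * _MM_SHUFFLE come from xmmintrin.h; the helper name is hypothetical.
 */
static __inline__ __m128
example_cmul_ps(__m128 a, __m128 b)
{
    __m128 re = _mm_moveldup_ps(a);                       /* [ar, ar, br, br]            */
    __m128 im = _mm_movehdup_ps(a);                       /* [ai, ai, bi, bi]            */
    __m128 sw = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2, 3, 0, 1)); /* swap re/im pairs of b  */
    return _mm_addsub_ps(_mm_mul_ps(re, b),               /* even lanes: re*re' - im*im' */
                         _mm_mul_ps(im, sw));             /* odd lanes:  re*im' + im*re' */
}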
+/// \param __b +/// A 128-bit vector of [2 x double] containing one of the source operands. +/// The horizontal difference of the values is stored in the upper bits of +/// the destination. +/// \returns A 128-bit vector of [2 x double] containing the horizontal +/// differences of both operands. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_hsub_pd(__m128d __a, __m128d __b) +{ + return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); +} + +/// Moves and duplicates one double-precision value to double-precision +/// values stored in a 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_loaddup_pd(double const *dp); +/// \endcode +/// +/// This intrinsic corresponds to the VMOVDDUP instruction. +/// +/// \param dp +/// A pointer to a double-precision value to be moved and duplicated. +/// \returns A 128-bit vector of [2 x double] containing the moved and +/// duplicated values. +#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) + +/// Moves and duplicates the double-precision value in the lower bits of +/// a 128-bit vector of [2 x double] to double-precision values stored in a +/// 128-bit vector of [2 x double]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVDDUP instruction. +/// +/// \param __a +/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits +/// [127:64] and [63:0] of the destination. +/// \returns A 128-bit vector of [2 x double] containing the moved and +/// duplicated values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_movedup_pd(__m128d __a) +{ + return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); +} + +/// Establishes a linear address memory range to be monitored and puts +/// the processor in the monitor event pending state. Data stored in the +/// monitored address range causes the processor to exit the pending state. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MONITOR instruction. +/// +/// \param __p +/// The memory range to be monitored. The size of the range is determined by +/// CPUID function 0000_0005h. +/// \param __extensions +/// Optional extensions for the monitoring state. +/// \param __hints +/// Optional hints for the monitoring state. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) +{ + __builtin_ia32_monitor(__p, __extensions, __hints); +} + +/// Used with the MONITOR instruction to wait while the processor is in +/// the monitor event pending state. Data stored in the monitored address +/// range causes the processor to exit the pending state. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MWAIT instruction. +/// +/// \param __extensions +/// Optional extensions for the monitoring state, which may vary by +/// processor. +/// \param __hints +/// Optional hints for the monitoring state, which may vary by processor. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_mwait(unsigned __extensions, unsigned __hints) +{ + __builtin_ia32_mwait(__extensions, __hints); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __PMMINTRIN_H */ diff --git a/include-llvm/popcntintrin.h b/include-llvm/popcntintrin.h new file mode 100644 index 0000000..0aa94ae --- /dev/null +++ b/include-llvm/popcntintrin.h @@ -0,0 +1,59 @@ +/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __POPCNTINTRIN_H +#define __POPCNTINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) + +#if defined(__cplusplus) && (__cplusplus >= 201103L) +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr +#else +#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS +#endif + +/// Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the POPCNT instruction. +/// +/// \param __A +/// An unsigned 32-bit integer operand. +/// \returns A 32-bit integer containing the number of bits with value 1 in the +/// source operand. +static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_popcnt_u32(unsigned int __A) +{ + return __builtin_popcount(__A); +} + +#ifdef __x86_64__ +/// Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the POPCNT instruction. +/// +/// \param __A +/// An unsigned 64-bit integer operand. +/// \returns A 64-bit integer containing the number of bits with value 1 in the +/// source operand. +static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_popcnt_u64(unsigned long long __A) +{ + return __builtin_popcountll(__A); +} +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_CONSTEXPR + +#endif /* __POPCNTINTRIN_H */ diff --git a/include-llvm/prfchwintrin.h b/include-llvm/prfchwintrin.h new file mode 100644 index 0000000..d2f91aa --- /dev/null +++ b/include-llvm/prfchwintrin.h @@ -0,0 +1,58 @@ +/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED) +#error "Never use directly; include or instead." +#endif + +#ifndef __PRFCHWINTRIN_H +#define __PRFCHWINTRIN_H + +/// Loads a memory sequence containing the specified memory address into +/// all data cache levels. The cache-coherency state is set to exclusive. +/// Data can be read from and written to the cache line without additional +/// delay. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PREFETCHT0 instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched. +static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_m_prefetch(void *__P) +{ + __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); +} + +/// Loads a memory sequence containing the specified memory address into +/// the L1 data cache and sets the cache-coherency to modified. This +/// provides a hint to the processor that the cache line will be modified. +/// It is intended for use when the cache line will be written to shortly +/// after the prefetch is performed. +/// +/// Note that the effect of this intrinsic is dependent on the processor +/// implementation. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PREFETCHW instruction. +/// +/// \param __P +/// A pointer specifying the memory address to be prefetched. 
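/*
 * Illustrative usage sketch (not part of the upstream header): Hamming
 * distance of two 32-bit words using the population-count intrinsic from
 * popcntintrin.h above. Assumes POPCNT code generation is enabled
 * (e.g. -mpopcnt); the helper name is hypothetical.
 */
static __inline__ int
example_hamming32(unsigned int a, unsigned int b)
{
    return _mm_popcnt_u32(a ^ b);   /* number of differing bits */
}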
+static __inline__ void __attribute__((__always_inline__, __nodebug__)) +_m_prefetchw(volatile const void *__P) +{ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wcast-qual" + __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */); +#pragma clang diagnostic pop +} + +#endif /* __PRFCHWINTRIN_H */ diff --git a/include-llvm/ptwriteintrin.h b/include-llvm/ptwriteintrin.h new file mode 100644 index 0000000..0a04f7c --- /dev/null +++ b/include-llvm/ptwriteintrin.h @@ -0,0 +1,37 @@ +/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __PTWRITEINTRIN_H +#define __PTWRITEINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("ptwrite"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_ptwrite32(unsigned int __value) { + __builtin_ia32_ptwrite32(__value); +} + +#ifdef __x86_64__ + +static __inline__ void __DEFAULT_FN_ATTRS +_ptwrite64(unsigned long long __value) { + __builtin_ia32_ptwrite64(__value); +} + +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __PTWRITEINTRIN_H */ diff --git a/include-llvm/rdseedintrin.h b/include-llvm/rdseedintrin.h new file mode 100644 index 0000000..ccb3d2d --- /dev/null +++ b/include-llvm/rdseedintrin.h @@ -0,0 +1,42 @@ +/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __RDSEEDINTRIN_H +#define __RDSEEDINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed"))) + +static __inline__ int __DEFAULT_FN_ATTRS +_rdseed16_step(unsigned short *__p) +{ + return __builtin_ia32_rdseed16_step(__p); +} + +static __inline__ int __DEFAULT_FN_ATTRS +_rdseed32_step(unsigned int *__p) +{ + return __builtin_ia32_rdseed32_step(__p); +} + +#ifdef __x86_64__ +static __inline__ int __DEFAULT_FN_ATTRS +_rdseed64_step(unsigned long long *__p) +{ + return __builtin_ia32_rdseed64_step(__p); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __RDSEEDINTRIN_H */ diff --git a/include-llvm/rtmintrin.h b/include-llvm/rtmintrin.h new file mode 100644 index 0000000..36ff583 --- /dev/null +++ b/include-llvm/rtmintrin.h @@ -0,0 +1,45 @@ +/*===---- rtmintrin.h - RTM intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __RTMINTRIN_H +#define __RTMINTRIN_H + +#define _XBEGIN_STARTED (~0u) +#define _XABORT_EXPLICIT (1 << 0) +#define _XABORT_RETRY (1 << 1) +#define _XABORT_CONFLICT (1 << 2) +#define _XABORT_CAPACITY (1 << 3) +#define _XABORT_DEBUG (1 << 4) +#define _XABORT_NESTED (1 << 5) +#define _XABORT_CODE(x) (((x) >> 24) & 0xFF) + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm"))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +_xbegin(void) +{ + return __builtin_ia32_xbegin(); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_xend(void) +{ + __builtin_ia32_xend(); +} + +#define _xabort(imm) __builtin_ia32_xabort((imm)) + +#undef __DEFAULT_FN_ATTRS + +#endif /* __RTMINTRIN_H */ diff --git a/include-llvm/serializeintrin.h b/include-llvm/serializeintrin.h new file mode 100644 index 0000000..b774e5a --- /dev/null +++ b/include-llvm/serializeintrin.h @@ -0,0 +1,30 @@ +/*===--------------- serializeintrin.h - serialize intrinsics --------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __SERIALIZEINTRIN_H +#define __SERIALIZEINTRIN_H + +/// Serialize instruction fetch and execution. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the SERIALIZE instruction. +/// +static __inline__ void +__attribute__((__always_inline__, __nodebug__, __target__("serialize"))) +_serialize (void) +{ + __builtin_ia32_serialize (); +} + +#endif /* __SERIALIZEINTRIN_H */ diff --git a/include-llvm/sgxintrin.h b/include-llvm/sgxintrin.h new file mode 100644 index 0000000..303a21f --- /dev/null +++ b/include-llvm/sgxintrin.h @@ -0,0 +1,60 @@ +/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __SGXINTRIN_H +#define __SGXINTRIN_H + +#if __has_extension(gnu_asm) + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("sgx"))) + +static __inline unsigned int __DEFAULT_FN_ATTRS +_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("enclu" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +static __inline unsigned int __DEFAULT_FN_ATTRS +_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("encls" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +static __inline unsigned int __DEFAULT_FN_ATTRS +_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) +{ + unsigned int __result; + __asm__ ("enclv" + : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) + : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) + : "cc"); + return __result; +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __has_extension(gnu_asm) */ + +#endif diff --git a/include-llvm/shaintrin.h b/include-llvm/shaintrin.h new file mode 100644 index 0000000..08b1fb1 --- /dev/null +++ b/include-llvm/shaintrin.h @@ -0,0 +1,61 @@ +/*===---- shaintrin.h - SHA intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __SHAINTRIN_H +#define __SHAINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128))) + +#define _mm_sha1rnds4_epu32(V1, V2, M) \ + __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha1nexte_epu32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha1msg1_epu32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha1msg2_epu32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z) +{ + return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha256msg1_epu32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha256msg2_epu32(__m128i __X, __m128i __Y) +{ + return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __SHAINTRIN_H */ diff --git a/include-llvm/smmintrin.h b/include-llvm/smmintrin.h new file mode 100644 index 0000000..aff83ee --- /dev/null +++ b/include-llvm/smmintrin.h @@ -0,0 +1,2383 @@ +/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __SMMINTRIN_H +#define __SMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128))) + +/* SSE4 Rounding macros. */ +#define _MM_FROUND_TO_NEAREST_INT 0x00 +#define _MM_FROUND_TO_NEG_INF 0x01 +#define _MM_FROUND_TO_POS_INF 0x02 +#define _MM_FROUND_TO_ZERO 0x03 +#define _MM_FROUND_CUR_DIRECTION 0x04 + +#define _MM_FROUND_RAISE_EXC 0x00 +#define _MM_FROUND_NO_EXC 0x08 + +#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) +#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) +#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) +#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) +#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) +#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) + +/// Rounds up each element of the 128-bit vector of [4 x float] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_ceil_ps(__m128 X); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float] values to be rounded up. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. 
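/*
 * Illustrative usage sketch (not part of the upstream header): the SHA
 * intrinsics in shaintrin.h above map one-to-one onto the SHA-NI
 * instructions. Advancing a SHA-1 state (packed ABCD, with E already folded
 * into the next four message dwords) by four rounds is a single call; the
 * immediate selects the round group (0 covers rounds 0-19). Assumes SHA code
 * generation is enabled (e.g. -msha); the helper name and variable layout
 * are hypothetical.
 */
static __inline__ __m128i
example_sha1_four_rounds(__m128i abcd, __m128i e_plus_wk)
{
    return _mm_sha1rnds4_epu32(abcd, e_plus_wk, 0);
}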
+#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) + +/// Rounds up each element of the 128-bit vector of [2 x double] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_ceil_pd(__m128d X); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double] values to be rounded up. +/// \returns A 128-bit vector of [2 x double] containing the rounded values. +#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) + +/// Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. Rounds up the lowest element of the second 128-bit vector +/// operand to an integer and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_ceil_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded up to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. +#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) + +/// Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds up the lower element of the second 128-bit vector operand to an +/// integer and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_ceil_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded up to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. +#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) + +/// Rounds down each element of the 128-bit vector of [4 x float] to an +/// an integer and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_floor_ps(__m128 X); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float] values to be rounded down. +/// \returns A 128-bit vector of [4 x float] containing the rounded values. +#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) + +/// Rounds down each element of the 128-bit vector of [2 x double] to an +/// integer and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_floor_pd(__m128d X); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. 
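/*
 * Illustrative usage sketch (not part of the upstream header): rounding a
 * vector of [4 x float] toward +infinity and toward -infinity with the
 * ceil/floor wrappers above. Assumes SSE4.1 code generation is enabled
 * (e.g. -msse4.1); the helper name is hypothetical.
 */
static __inline__ void
example_ceil_floor(void)
{
    __m128 v  = _mm_set_ps(2.5f, -2.5f, 1.25f, -1.25f);
    __m128 up = _mm_ceil_ps(v);    /* 3.0, -2.0, 2.0, -1.0 (same element order as above) */
    __m128 dn = _mm_floor_ps(v);   /* 2.0, -3.0, 1.0, -2.0                               */
    (void)up; (void)dn;
}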
+/// \returns A 128-bit vector of [2 x double] containing the rounded values. +#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) + +/// Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. Rounds down the lowest element of the second 128-bit vector +/// operand to an integer and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_floor_ss(__m128 X, __m128 Y); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded down to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. +#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) + +/// Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds down the lower element of the second 128-bit vector operand to an +/// integer and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_floor_sd(__m128d X, __m128d Y); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded down to the nearest integer and copied to the corresponding bits +/// of the result. +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. +#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) + +/// Rounds each element of the 128-bit vector of [4 x float] to an +/// integer value according to the rounding control specified by the second +/// argument and returns the rounded values in a 128-bit vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_round_ps(__m128 X, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the rounded values. +#define _mm_round_ps(X, M) \ + ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))) + +/// Copies three upper elements of the first 128-bit vector operand to +/// the corresponding three upper elements of the 128-bit result vector of +/// [4 x float]. 
Rounds the lowest element of the second 128-bit vector +/// operand to an integer value according to the rounding control specified +/// by the third argument and copies it to the lowest element of the 128-bit +/// result vector of [4 x float]. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are +/// copied to the corresponding bits of the result. +/// \param Y +/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is +/// rounded to the nearest integer using the specified rounding control and +/// copied to the corresponding bits of the result. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [4 x float] containing the copied and rounded +/// values. +#define _mm_round_ss(X, Y, M) \ + ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M))) + +/// Rounds each element of the 128-bit vector of [2 x double] to an +/// integer value according to the rounding control specified by the second +/// argument and returns the rounded values in a 128-bit vector of +/// [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_round_pd(__m128d X, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the rounded values. +#define _mm_round_pd(X, M) \ + ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))) + +/// Copies the upper element of the first 128-bit vector operand to the +/// corresponding upper element of the 128-bit result vector of [2 x double]. +/// Rounds the lower element of the second 128-bit vector operand to an +/// integer value according to the rounding control specified by the third +/// argument and copies it to the lower element of the 128-bit result vector +/// of [2 x double]. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is +/// copied to the corresponding bits of the result. 
+/// \param Y +/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is +/// rounded to the nearest integer using the specified rounding control and +/// copied to the corresponding bits of the result. +/// \param M +/// An integer value that specifies the rounding operation. \n +/// Bits [7:4] are reserved. \n +/// Bit [3] is a precision exception value: \n +/// 0: A normal PE exception is used \n +/// 1: The PE field is not updated \n +/// Bit [2] is the rounding control source: \n +/// 0: Use bits [1:0] of \a M \n +/// 1: Use the current MXCSR setting \n +/// Bits [1:0] contain the rounding control definition: \n +/// 00: Nearest \n +/// 01: Downward (toward negative infinity) \n +/// 10: Upward (toward positive infinity) \n +/// 11: Truncated +/// \returns A 128-bit vector of [2 x double] containing the copied and rounded +/// values. +#define _mm_round_sd(X, Y, M) \ + ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M))) + +/* SSE4 Packed Blending Intrinsics. */ +/// Returns a 128-bit vector of [2 x double] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VBLENDPD / BLENDPD instruction. +/// +/// \param V1 +/// A 128-bit vector of [2 x double]. +/// \param V2 +/// A 128-bit vector of [2 x double]. +/// \param M +/// An immediate integer operand, with mask bits [1:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 64-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [2 x double] containing the copied values. +#define _mm_blend_pd(V1, V2, M) \ + ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ + (__v2df)(__m128d)(V2), (int)(M))) + +/// Returns a 128-bit vector of [4 x float] where the values are selected +/// from either the first or second operand as specified by the third +/// operand, the control mask. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VBLENDPS / BLENDPS instruction. +/// +/// \param V1 +/// A 128-bit vector of [4 x float]. +/// \param V2 +/// A 128-bit vector of [4 x float]. +/// \param M +/// An immediate integer operand, with mask bits [3:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 32-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [4 x float] containing the copied values. +#define _mm_blend_ps(V1, V2, M) \ + ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ + (__v4sf)(__m128)(V2), (int)(M))) + +/// Returns a 128-bit vector of [2 x double] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. 
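
A small sketch of the immediate-mask blends above (illustrative only, not part of the diff; helper names are made up; assumes clang with -msse4.1). The mask must be an integer constant expression:

#include <smmintrin.h>

/* Result lane i comes from b when bit i of the immediate is set, else from a.
 * 0xA = 0b1010 takes lanes 1 and 3 from b and lanes 0 and 2 from a. */
static inline __m128 merge_odd_lanes(__m128 a, __m128 b)
{
    return _mm_blend_ps(a, b, 0xA);
}

/* Same idea for [2 x double]: keep the low lane of a, take the high lane of b. */
static inline __m128d keep_lo_take_hi(__m128d a, __m128d b)
{
    return _mm_blend_pd(a, b, 0x2);
}
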
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDVPD / BLENDVPD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [2 x double]. +/// \param __V2 +/// A 128-bit vector of [2 x double]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 64-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 64-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [2 x double] containing the copied values. +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) +{ + return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, + (__v2df)__M); +} + +/// Returns a 128-bit vector of [4 x float] where the values are +/// selected from either the first or second operand as specified by the +/// third operand, the control mask. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDVPS / BLENDVPS instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x float]. +/// \param __V2 +/// A 128-bit vector of [4 x float]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying +/// how the values are to be copied. The position of the mask bit corresponds +/// to the most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 32-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 32-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [4 x float] containing the copied values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) +{ + return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, + (__v4sf)__M); +} + +/// Returns a 128-bit vector of [16 x i8] where the values are selected +/// from either of the first or second operand as specified by the third +/// operand, the control mask. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPBLENDVB / PBLENDVB instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8]. +/// \param __M +/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying +/// how the values are to be copied. The position of the mask bit corresponds +/// to the most significant bit of a copied value. When a mask bit is 0, the +/// corresponding 8-bit element in operand \a __V1 is copied to the same +/// position in the result. When a mask bit is 1, the corresponding 8-bit +/// element in operand \a __V2 is copied to the same position in the result. +/// \returns A 128-bit vector of [16 x i8] containing the copied values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) +{ + return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, + (__v16qi)__M); +} + +/// Returns a 128-bit vector of [8 x i16] where the values are selected +/// from either of the first or second operand as specified by the third +/// operand, the control mask. 
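
The variable blends take their mask from a third vector at run time, and only the sign bit of each lane matters, which makes them a natural way to finish a per-lane comparison. A hedged sketch (not part of the diff; assumes clang with -msse4.1):

#include <smmintrin.h>

/* Per-lane maximum of two [2 x double] vectors built from a comparison mask:
 * _mm_cmpgt_pd produces all-ones lanes where b > a, and _mm_blendv_pd then
 * takes b exactly in those lanes (it looks only at each lane's sign bit). */
static inline __m128d max_pd_via_blendv(__m128d a, __m128d b)
{
    __m128d b_gt_a = _mm_cmpgt_pd(b, a);
    return _mm_blendv_pd(a, b, b_gt_a);
}
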
+/// +/// \headerfile +/// +/// \code +/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPBLENDW / PBLENDW instruction. +/// +/// \param V1 +/// A 128-bit vector of [8 x i16]. +/// \param V2 +/// A 128-bit vector of [8 x i16]. +/// \param M +/// An immediate integer operand, with mask bits [7:0] specifying how the +/// values are to be copied. The position of the mask bit corresponds to the +/// index of a copied value. When a mask bit is 0, the corresponding 16-bit +/// element in operand \a V1 is copied to the same position in the result. +/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 +/// is copied to the same position in the result. +/// \returns A 128-bit vector of [8 x i16] containing the copied values. +#define _mm_blend_epi16(V1, V2, M) \ + ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ + (__v8hi)(__m128i)(V2), (int)(M))) + +/* SSE4 Dword Multiply Instructions. */ +/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] +/// and returns the lower 32 bits of the each product in a 128-bit vector of +/// [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULLD / PMULLD instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the products of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mullo_epi32 (__m128i __V1, __m128i __V2) +{ + return (__m128i) ((__v4su)__V1 * (__v4su)__V2); +} + +/// Multiplies corresponding even-indexed elements of two 128-bit +/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] +/// containing the products. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMULDQ / PMULDQ instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [2 x i64] containing the products of both +/// operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mul_epi32 (__m128i __V1, __m128i __V2) +{ + return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); +} + +/* SSE4 Floating Point Dot Product Instructions. */ +/// Computes the dot product of the two 128-bit vectors of [4 x float] +/// and returns it in the elements of the 128-bit result vector of +/// [4 x float]. +/// +/// The immediate integer operand controls which input elements +/// will contribute to the dot product, and where the final results are +/// returned. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VDPPS / DPPS instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param Y +/// A 128-bit vector of [4 x float]. +/// \param M +/// An immediate integer operand. Mask bits [7:4] determine which elements +/// of the input vectors are used, with bit [4] corresponding to the lowest +/// element and bit [7] corresponding to the highest element of each [4 x +/// float] vector. If a bit is set, the corresponding elements from the two +/// input vectors are used as an input for dot product; otherwise that input +/// is treated as zero. Bits [3:0] determine which elements of the result +/// will receive a copy of the final dot product, with bit [0] corresponding +/// to the lowest element and bit [3] corresponding to the highest element of +/// each [4 x float] subvector. 
If a bit is set, the dot product is returned +/// in the corresponding element; otherwise that element is set to zero. +/// \returns A 128-bit vector of [4 x float] containing the dot product. +#define _mm_dp_ps(X, Y, M) \ + ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ + (__v4sf)(__m128)(Y), (M))) + +/// Computes the dot product of the two 128-bit vectors of [2 x double] +/// and returns it in the elements of the 128-bit result vector of +/// [2 x double]. +/// +/// The immediate integer operand controls which input +/// elements will contribute to the dot product, and where the final results +/// are returned. +/// +/// \headerfile +/// +/// \code +/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VDPPD / DPPD instruction. +/// +/// \param X +/// A 128-bit vector of [2 x double]. +/// \param Y +/// A 128-bit vector of [2 x double]. +/// \param M +/// An immediate integer operand. Mask bits [5:4] determine which elements +/// of the input vectors are used, with bit [4] corresponding to the lowest +/// element and bit [5] corresponding to the highest element of each of [2 x +/// double] vector. If a bit is set, the corresponding elements from the two +/// input vectors are used as an input for dot product; otherwise that input +/// is treated as zero. Bits [1:0] determine which elements of the result +/// will receive a copy of the final dot product, with bit [0] corresponding +/// to the lowest element and bit [1] corresponding to the highest element of +/// each [2 x double] vector. If a bit is set, the dot product is returned in +/// the corresponding element; otherwise that element is set to zero. +#define _mm_dp_pd(X, Y, M) \ + ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), (M))) + +/* SSE4 Streaming Load Hint Instruction. */ +/// Loads integer values from a 128-bit aligned memory location to a +/// 128-bit integer vector. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTDQA / MOVNTDQA instruction. +/// +/// \param __V +/// A pointer to a 128-bit aligned memory location that contains the integer +/// values. +/// \returns A 128-bit integer vector containing the data stored at the +/// specified memory location. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_stream_load_si128 (__m128i const *__V) +{ + return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); +} + +/* SSE4 Packed Integer Min/Max Instructions. */ +/// Compares the corresponding elements of two 128-bit vectors of +/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser +/// of the two values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINSB / PMINSB instruction. +/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8] +/// \returns A 128-bit vector of [16 x i8] containing the lesser values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epi8 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); +#else + return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the +/// greater value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXSB / PMAXSB instruction. 
+/// +/// \param __V1 +/// A 128-bit vector of [16 x i8]. +/// \param __V2 +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit vector of [16 x i8] containing the greater values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epi8 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); +#else + return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser +/// value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINUW / PMINUW instruction. +/// +/// \param __V1 +/// A 128-bit vector of [8 x u16]. +/// \param __V2 +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit vector of [8 x u16] containing the lesser values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epu16 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); +#else + return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the +/// greater value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXUW / PMAXUW instruction. +/// +/// \param __V1 +/// A 128-bit vector of [8 x u16]. +/// \param __V2 +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit vector of [8 x u16] containing the greater values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epu16 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); +#else + return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser +/// value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINSD / PMINSD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the lesser values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epi32 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); +#else + return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the +/// greater value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXSD / PMAXSD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit vector of [4 x i32] containing the greater values. 
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epi32 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); +#else + return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser +/// value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMINUD / PMINUD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x u32]. +/// \param __V2 +/// A 128-bit vector of [4 x u32]. +/// \returns A 128-bit vector of [4 x u32] containing the lesser values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_min_epu32 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); +#else + return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2); +#endif +} + +/// Compares the corresponding elements of two 128-bit vectors of +/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the +/// greater value of the two. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMAXUD / PMAXUD instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x u32]. +/// \param __V2 +/// A 128-bit vector of [4 x u32]. +/// \returns A 128-bit vector of [4 x u32] containing the greater values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_max_epu32 (__m128i __V1, __m128i __V2) +{ +#if (__clang_major__ < 14) + return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); +#else + return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2); +#endif +} + +/* SSE4 Insertion and Extraction from XMM Register Instructions. */ +/// Takes the first argument \a X and inserts an element from the second +/// argument \a Y as selected by the third argument \a N. That result then +/// has elements zeroed out also as selected by the third argument \a N. The +/// resulting 128-bit vector of [4 x float] is then returned. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VINSERTPS instruction. +/// +/// \param X +/// A 128-bit vector source operand of [4 x float]. With the exception of +/// those bits in the result copied from parameter \a Y and zeroed by bits +/// [3:0] of \a N, all bits from this parameter are copied to the result. +/// \param Y +/// A 128-bit vector source operand of [4 x float]. One single-precision +/// floating-point element from this source, as determined by the immediate +/// parameter, is copied to the result. +/// \param N +/// Specifies which bits from operand \a Y will be copied, which bits in the +/// result they will be be copied to, and which bits in the result will be +/// cleared. The following assignments are made: \n +/// Bits [7:6] specify the bits to copy from operand \a Y: \n +/// 00: Selects bits [31:0] from operand \a Y. \n +/// 01: Selects bits [63:32] from operand \a Y. \n +/// 10: Selects bits [95:64] from operand \a Y. \n +/// 11: Selects bits [127:96] from operand \a Y. \n +/// Bits [5:4] specify the bits in the result to which the selected bits +/// from operand \a Y are copied: \n +/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n +/// 01: Copies the selected bits from \a Y to result bits [63:32]. 
\n +/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n +/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n +/// Bits[3:0]: If any of these bits are set, the corresponding result +/// element is cleared. +/// \returns A 128-bit vector of [4 x float] containing the copied +/// single-precision floating point elements from the operands. +#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) + +/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and +/// returns it, using the immediate value parameter \a N as a selector. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_ps(__m128 X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VEXTRACTPS / EXTRACTPS +/// instruction. +/// +/// \param X +/// A 128-bit vector of [4 x float]. +/// \param N +/// An immediate value. Bits [1:0] determines which bits from the argument +/// \a X are extracted and returned: \n +/// 00: Bits [31:0] of parameter \a X are returned. \n +/// 01: Bits [63:32] of parameter \a X are returned. \n +/// 10: Bits [95:64] of parameter \a X are returned. \n +/// 11: Bits [127:96] of parameter \a X are returned. +/// \returns A 32-bit integer containing the extracted 32 bits of float data. +#define _mm_extract_ps(X, N) \ + __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N))) + +/* Miscellaneous insert and extract macros. */ +/* Extract a single-precision float from X at index N into D. */ +#define _MM_EXTRACT_FLOAT(D, X, N) \ + do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0) + +/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create + an index suitable for _mm_insert_ps. */ +#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) + +/* Extract a float from X at index N into the first index of the return. */ +#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \ + _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) + +/* Insert int into packed integer array at index. */ +/// Constructs a 128-bit vector of [16 x i8] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the lower 8 bits +/// of an integer parameter \a I into an offset specified by the immediate +/// value parameter \a N. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPINSRB / PINSRB instruction. +/// +/// \param X +/// A 128-bit integer vector of [16 x i8]. This vector is copied to the +/// result and then one of the sixteen elements in the result vector is +/// replaced by the lower 8 bits of \a I. +/// \param I +/// An integer. The lower 8 bits of this operand are written to the result +/// beginning at the offset specified by \a N. +/// \param N +/// An immediate value. Bits [3:0] specify the bit offset in the result at +/// which the lower 8 bits of \a I are written. \n +/// 0000: Bits [7:0] of the result are used for insertion. \n +/// 0001: Bits [15:8] of the result are used for insertion. \n +/// 0010: Bits [23:16] of the result are used for insertion. \n +/// 0011: Bits [31:24] of the result are used for insertion. \n +/// 0100: Bits [39:32] of the result are used for insertion. \n +/// 0101: Bits [47:40] of the result are used for insertion. \n +/// 0110: Bits [55:48] of the result are used for insertion. \n +/// 0111: Bits [63:56] of the result are used for insertion. 
\n +/// 1000: Bits [71:64] of the result are used for insertion. \n +/// 1001: Bits [79:72] of the result are used for insertion. \n +/// 1010: Bits [87:80] of the result are used for insertion. \n +/// 1011: Bits [95:88] of the result are used for insertion. \n +/// 1100: Bits [103:96] of the result are used for insertion. \n +/// 1101: Bits [111:104] of the result are used for insertion. \n +/// 1110: Bits [119:112] of the result are used for insertion. \n +/// 1111: Bits [127:120] of the result are used for insertion. +/// \returns A 128-bit integer vector containing the constructed values. +#define _mm_insert_epi8(X, I, N) \ + ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ + (int)(I), (int)(N))) + +/// Constructs a 128-bit vector of [4 x i32] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the 32-bit +/// integer parameter \a I at the offset specified by the immediate value +/// parameter \a N. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPINSRD / PINSRD instruction. +/// +/// \param X +/// A 128-bit integer vector of [4 x i32]. This vector is copied to the +/// result and then one of the four elements in the result vector is +/// replaced by \a I. +/// \param I +/// A 32-bit integer that is written to the result beginning at the offset +/// specified by \a N. +/// \param N +/// An immediate value. Bits [1:0] specify the bit offset in the result at +/// which the integer \a I is written. \n +/// 00: Bits [31:0] of the result are used for insertion. \n +/// 01: Bits [63:32] of the result are used for insertion. \n +/// 10: Bits [95:64] of the result are used for insertion. \n +/// 11: Bits [127:96] of the result are used for insertion. +/// \returns A 128-bit integer vector containing the constructed values. +#define _mm_insert_epi32(X, I, N) \ + ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ + (int)(I), (int)(N))) + +#ifdef __x86_64__ +/// Constructs a 128-bit vector of [2 x i64] by first making a copy of +/// the 128-bit integer vector parameter, and then inserting the 64-bit +/// integer parameter \a I, using the immediate value parameter \a N as an +/// insertion location selector. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPINSRQ / PINSRQ instruction. +/// +/// \param X +/// A 128-bit integer vector of [2 x i64]. This vector is copied to the +/// result and then one of the two elements in the result vector is replaced +/// by \a I. +/// \param I +/// A 64-bit integer that is written to the result beginning at the offset +/// specified by \a N. +/// \param N +/// An immediate value. Bit [0] specifies the bit offset in the result at +/// which the integer \a I is written. \n +/// 0: Bits [63:0] of the result are used for insertion. \n +/// 1: Bits [127:64] of the result are used for insertion. \n +/// \returns A 128-bit integer vector containing the constructed values. +#define _mm_insert_epi64(X, I, N) \ + ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ + (long long)(I), (int)(N))) +#endif /* __x86_64__ */ + +/* Extract int from packed integer array at index. This returns the element + * as a zero extended value, so it is unsigned. 
+ */ +/// Extracts an 8-bit element from the 128-bit integer vector of +/// [16 x i8], using the immediate value parameter \a N as a selector. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_epi8(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPEXTRB / PEXTRB instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bits [3:0] specify which 8-bit vector element from +/// the argument \a X to extract and copy to the result. \n +/// 0000: Bits [7:0] of parameter \a X are extracted. \n +/// 0001: Bits [15:8] of the parameter \a X are extracted. \n +/// 0010: Bits [23:16] of the parameter \a X are extracted. \n +/// 0011: Bits [31:24] of the parameter \a X are extracted. \n +/// 0100: Bits [39:32] of the parameter \a X are extracted. \n +/// 0101: Bits [47:40] of the parameter \a X are extracted. \n +/// 0110: Bits [55:48] of the parameter \a X are extracted. \n +/// 0111: Bits [63:56] of the parameter \a X are extracted. \n +/// 1000: Bits [71:64] of the parameter \a X are extracted. \n +/// 1001: Bits [79:72] of the parameter \a X are extracted. \n +/// 1010: Bits [87:80] of the parameter \a X are extracted. \n +/// 1011: Bits [95:88] of the parameter \a X are extracted. \n +/// 1100: Bits [103:96] of the parameter \a X are extracted. \n +/// 1101: Bits [111:104] of the parameter \a X are extracted. \n +/// 1110: Bits [119:112] of the parameter \a X are extracted. \n +/// 1111: Bits [127:120] of the parameter \a X are extracted. +/// \returns An unsigned integer, whose lower 8 bits are selected from the +/// 128-bit integer vector parameter and the remaining bits are assigned +/// zeros. +#define _mm_extract_epi8(X, N) \ + ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ + (int)(N))) + +/// Extracts a 32-bit element from the 128-bit integer vector of +/// [4 x i32], using the immediate value parameter \a N as a selector. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_epi32(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPEXTRD / PEXTRD instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bits [1:0] specify which 32-bit vector element from +/// the argument \a X to extract and copy to the result. \n +/// 00: Bits [31:0] of the parameter \a X are extracted. \n +/// 01: Bits [63:32] of the parameter \a X are extracted. \n +/// 10: Bits [95:64] of the parameter \a X are extracted. \n +/// 11: Bits [127:96] of the parameter \a X are exracted. +/// \returns An integer, whose lower 32 bits are selected from the 128-bit +/// integer vector parameter and the remaining bits are assigned zeros. +#define _mm_extract_epi32(X, N) \ + ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) + +#ifdef __x86_64__ +/// Extracts a 64-bit element from the 128-bit integer vector of +/// [2 x i64], using the immediate value parameter \a N as a selector. +/// +/// \headerfile +/// +/// \code +/// long long _mm_extract_epi64(__m128i X, const int N); +/// \endcode +/// +/// This intrinsic corresponds to the VPEXTRQ / PEXTRQ instruction. +/// +/// \param X +/// A 128-bit integer vector. +/// \param N +/// An immediate value. Bit [0] specifies which 64-bit vector element from +/// the argument \a X to return. \n +/// 0: Bits [63:0] are returned. \n +/// 1: Bits [127:64] are returned. \n +/// \returns A 64-bit integer. 
+#define _mm_extract_epi64(X, N) \ + ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) +#endif /* __x86_64 */ + +/* SSE4 128-bit Packed Integer Comparisons. */ +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_testz_si128(__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are all ones; FALSE otherwise. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_testc_si128(__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); +} + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param __M +/// A 128-bit integer vector containing the bits to be tested. +/// \param __V +/// A 128-bit integer vector selecting which bits to test in operand \a __M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_testnzc_si128(__m128i __M, __m128i __V) +{ + return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); +} + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// ones. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_all_ones(__m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param V +/// A 128-bit integer vector containing the bits to be tested. +/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE +/// otherwise. +#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) + +/// Tests whether the specified bits in a 128-bit integer vector are +/// neither all zeros nor all ones. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. +/// \param V +/// A 128-bit integer vector selecting which bits to test in operand \a M. +/// \returns TRUE if the specified bits are neither all zeros nor all ones; +/// FALSE otherwise. +#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) + +/// Tests whether the specified bits in a 128-bit integer vector are all +/// zeros. +/// +/// \headerfile +/// +/// \code +/// int _mm_test_all_zeros(__m128i M, __m128i V); +/// \endcode +/// +/// This intrinsic corresponds to the VPTEST / PTEST instruction. +/// +/// \param M +/// A 128-bit integer vector containing the bits to be tested. 
+/// \param V +/// A 128-bit integer vector selecting which bits to test in operand \a M. +/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. +#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) + +/* SSE4 64-bit Packed Integer Comparisons. */ +/// Compares each of the corresponding 64-bit values of the 128-bit +/// integer vectors for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPEQQ / PCMPEQQ instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) +{ + return (__m128i)((__v2di)__V1 == (__v2di)__V2); +} + +/* SSE4 Packed Integer Sign-Extension. */ +/// Sign-extends each of the lower eight 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a +/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector +/// are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXBW / PMOVSXBW instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are sign- +/// extended to 16-bit values. +/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi8_epi16(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); +} + +/// Sign-extends each of the lower four 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a +/// 128-bit vector of [4 x i32]. The upper twelve elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXBD / PMOVSXBD instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// sign-extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi8_epi32(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. */ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); +} + +/// Sign-extends each of the lower two 8-bit integer elements of a +/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXBQ / PMOVSXBQ instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are +/// sign-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi8_epi64(__m128i __V) +{ + /* This function always performs a signed extension, but __v16qi is a char + which may be signed or unsigned, so use __v16qs. 
*/ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); +} + +/// Sign-extends each of the lower four 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in +/// a 128-bit vector of [4 x i32]. The upper four elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXWD / PMOVSXWD instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// sign-extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi16_epi32(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); +} + +/// Sign-extends each of the lower two 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper six elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXWQ / PMOVSXWQ instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// sign-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi16_epi64(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); +} + +/// Sign-extends each of the lower two 32-bit integer elements of a +/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector +/// are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVSXDQ / PMOVSXDQ instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// sign-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepi32_epi64(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); +} + +/* SSE4 Packed Integer Zero-Extension. */ +/// Zero-extends each of the lower eight 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a +/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector +/// are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXBW / PMOVZXBW instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are +/// zero-extended to 16-bit values. +/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu8_epi16(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); +} + +/// Zero-extends each of the lower four 8-bit integer elements of a +/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a +/// 128-bit vector of [4 x i32]. The upper twelve elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXBD / PMOVZXBD instruction. 
+/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are +/// zero-extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu8_epi32(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); +} + +/// Zero-extends each of the lower two 8-bit integer elements of a +/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXBQ / PMOVZXBQ instruction. +/// +/// \param __V +/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are +/// zero-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu8_epi64(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); +} + +/// Zero-extends each of the lower four 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in +/// a 128-bit vector of [4 x i32]. The upper four elements of the input +/// vector are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXWD / PMOVZXWD instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are +/// zero-extended to 32-bit values. +/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu16_epi32(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); +} + +/// Zero-extends each of the lower two 16-bit integer elements of a +/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector +/// are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXWQ / PMOVZXWQ instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are +/// zero-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu16_epi64(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); +} + +/// Zero-extends each of the lower two 32-bit integer elements of a +/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in +/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector +/// are unused. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPMOVZXDQ / PMOVZXDQ instruction. +/// +/// \param __V +/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are +/// zero-extended to 64-bit values. +/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cvtepu32_epi64(__m128i __V) +{ + return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); +} + +/* SSE4 Pack with Unsigned Saturation. 
*/ +/// Converts 32-bit signed integers from both 128-bit integer vector +/// operands into 16-bit unsigned integers, and returns the packed result. +/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than +/// 0x0000 are saturated to 0x0000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPACKUSDW / PACKUSDW instruction. +/// +/// \param __V1 +/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a +/// signed integer and is converted to a 16-bit unsigned integer with +/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values +/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values +/// are written to the lower 64 bits of the result. +/// \param __V2 +/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a +/// signed integer and is converted to a 16-bit unsigned integer with +/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values +/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values +/// are written to the higher 64 bits of the result. +/// \returns A 128-bit vector of [8 x i16] containing the converted values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_packus_epi32(__m128i __V1, __m128i __V2) +{ + return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); +} + +/* SSE4 Multiple Packed Sums of Absolute Difference. */ +/// Subtracts 8-bit unsigned integer values and computes the absolute +/// values of the differences to the corresponding bits in the destination. +/// Then sums of the absolute differences are returned according to the bit +/// fields in the immediate operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VMPSADBW / MPSADBW instruction. +/// +/// \param X +/// A 128-bit vector of [16 x i8]. +/// \param Y +/// A 128-bit vector of [16 x i8]. +/// \param M +/// An 8-bit immediate operand specifying how the absolute differences are to +/// be calculated, according to the following algorithm: +/// \code +/// // M2 represents bit 2 of the immediate operand +/// // M10 represents bits [1:0] of the immediate operand +/// i = M2 * 4; +/// j = M10 * 4; +/// for (k = 0; k < 8; k = k + 1) { +/// d0 = abs(X[i + k + 0] - Y[j + 0]); +/// d1 = abs(X[i + k + 1] - Y[j + 1]); +/// d2 = abs(X[i + k + 2] - Y[j + 2]); +/// d3 = abs(X[i + k + 3] - Y[j + 3]); +/// r[k] = d0 + d1 + d2 + d3; +/// } +/// \endcode +/// \returns A 128-bit integer vector containing the sums of the sets of +/// absolute differences between both operands. +#define _mm_mpsadbw_epu8(X, Y, M) \ + ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ + (__v16qi)(__m128i)(Y), (M))) + +/// Finds the minimum unsigned 16-bit element in the input 128-bit +/// vector of [8 x u16] and returns it and along with its index. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPHMINPOSUW / PHMINPOSUW +/// instruction. +/// +/// \param __V +/// A 128-bit vector of [8 x u16]. +/// \returns A 128-bit value where bits [15:0] contain the minimum value found +/// in parameter \a __V, bits [18:16] contain the index of the minimum value +/// and the remaining bits are set to 0. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_minpos_epu16(__m128i __V) +{ + return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); +} + +/* Handle the sse4.2 definitions here. 
*/ + +/* These definitions are normally in nmmintrin.h, but gcc puts them in here + so we'll do the same. */ + +#undef __DEFAULT_FN_ATTRS +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) + +/* These specify the type of data that we're comparing. */ +#define _SIDD_UBYTE_OPS 0x00 +#define _SIDD_UWORD_OPS 0x01 +#define _SIDD_SBYTE_OPS 0x02 +#define _SIDD_SWORD_OPS 0x03 + +/* These specify the type of comparison operation. */ +#define _SIDD_CMP_EQUAL_ANY 0x00 +#define _SIDD_CMP_RANGES 0x04 +#define _SIDD_CMP_EQUAL_EACH 0x08 +#define _SIDD_CMP_EQUAL_ORDERED 0x0c + +/* These macros specify the polarity of the operation. */ +#define _SIDD_POSITIVE_POLARITY 0x00 +#define _SIDD_NEGATIVE_POLARITY 0x10 +#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 +#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 + +/* These macros are used in _mm_cmpXstri() to specify the return. */ +#define _SIDD_LEAST_SIGNIFICANT 0x00 +#define _SIDD_MOST_SIGNIFICANT 0x40 + +/* These macros are used in _mm_cmpXstri() to specify the return. */ +#define _SIDD_BIT_MASK 0x00 +#define _SIDD_UNIT_MASK 0x40 + +/* SSE4.2 Packed Comparison Intrinsics. */ +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns a 128-bit integer vector representing the result +/// mask of the comparison. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRM / PCMPISTRM +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 +/// bytes. \n +/// 0: The result is zero-extended to 16 bytes. \n +/// 1: The result is expanded to 16 bytes (this expansion is performed by +/// repeating each bit 8 or 16 times). +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. 
+#define _mm_cmpistrm(A, B, M) \ + ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns an integer representing the result index of the +/// comparison. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistri(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the index of the lowest set bit or the +/// highest set bit is returned. \n +/// 0: The index of the least significant set bit. \n +/// 1: The index of the most significant set bit. \n +/// \returns Returns an integer representing the result index of the comparison. +#define _mm_cmpistri(A, B, M) \ + ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns a 128-bit integer vector representing the result +/// mask of the comparison. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRM / PCMPESTRM +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. 
\n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 +/// bytes. \n +/// 0: The result is zero-extended to 16 bytes. \n +/// 1: The result is expanded to 16 bytes (this expansion is performed by +/// repeating each bit 8 or 16 times). \n +/// \returns Returns a 128-bit integer vector representing the result mask of +/// the comparison. +#define _mm_cmpestrm(A, LA, B, LB, M) \ + ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns an integer representing the result index of the +/// comparison. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words, the type of comparison to perform, and the format of the return +/// value. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. 
\n +/// Bit [6]: Determines whether the index of the lowest set bit or the +/// highest set bit is returned. \n +/// 0: The index of the least significant set bit. \n +/// 1: The index of the most significant set bit. \n +/// \returns Returns an integer representing the result index of the comparison. +#define _mm_cmpestri(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the +/// string in \a B is the maximum, otherwise, returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistra(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the bit mask is zero and the length of the string in +/// \a B is the maximum; otherwise, returns 0. +#define _mm_cmpistra(A, B, M) \ + ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns +/// 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrc(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. 
\n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0. +#define _mm_cmpistrc(A, B, M) \ + ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns bit 0 of the resulting bit mask. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistro(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns bit 0 of the resulting bit mask. +#define _mm_cmpistro(A, B, M) \ + ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a A is less than +/// the maximum, otherwise, returns 0. 
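As a usage sketch for the implicit-length, flag-reading form defined above (assuming SSE4.2 is enabled, e.g. with -msse4.2; the helper name is invented for the example):

/* Non-zero if any byte of "block" also occurs in "set".  Both operands are
 * treated as NUL-terminated strings of at most 16 bytes (implicit length). */
static inline int block_contains_any(__m128i set, __m128i block)
{
    return _mm_cmpistrc(set, block, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
}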
+/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrs(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the length of the string in \a A is less than the +/// maximum, otherwise, returns 0. +#define _mm_cmpistrs(A, B, M) \ + ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with implicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a B is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpistrz(__m128i A, __m128i B, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. 
\n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the length of the string in \a B is less than the +/// maximum, otherwise, returns 0. +#define _mm_cmpistrz(A, B, M) \ + ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the +/// string in \a B is the maximum, otherwise, returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the bit mask is zero and the length of the string in +/// \a B is the maximum, otherwise, returns 0. +#define _mm_cmpestra(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, +/// returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. 
+/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0. +#define _mm_cmpestrc(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns bit 0 of the resulting bit mask. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns bit 0 of the resulting bit mask. 
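A corresponding sketch for the explicit-length index form (again assuming SSE4.2; the helper name and parameters are illustrative, not part of the header):

/* Offset within the 16-byte window "hay" (valid length lh) at which "needle"
 * (valid length ln) starts an ordered match, or 16 if no match begins here.
 * A match that begins near the end may continue past this window. */
static inline int find_in_window(__m128i needle, int ln, __m128i hay, int lh)
{
    return _mm_cmpestri(needle, ln, hay, lh,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ORDERED |
                        _SIDD_LEAST_SIGNIFICANT);
}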
+#define _mm_cmpestro(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a A is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI +/// instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. \n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement in the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. \n +/// \returns Returns 1 if the length of the string in \a A is less than the +/// maximum, otherwise, returns 0. +#define _mm_cmpestrs(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/// Uses the immediate operand \a M to perform a comparison of string +/// data with explicitly defined lengths that is contained in source operands +/// \a A and \a B. Returns 1 if the length of the string in \a B is less than +/// the maximum, otherwise, returns 0. +/// +/// \headerfile +/// +/// \code +/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); +/// \endcode +/// +/// This intrinsic corresponds to the VPCMPESTRI instruction. +/// +/// \param A +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LA +/// An integer that specifies the length of the string in \a A. +/// \param B +/// A 128-bit integer vector containing one of the source operands to be +/// compared. +/// \param LB +/// An integer that specifies the length of the string in \a B. +/// \param M +/// An 8-bit immediate operand specifying whether the characters are bytes or +/// words and the type of comparison to perform. \n +/// Bits [1:0]: Determine source data format. 
\n +/// 00: 16 unsigned bytes \n +/// 01: 8 unsigned words \n +/// 10: 16 signed bytes \n +/// 11: 8 signed words \n +/// Bits [3:2]: Determine comparison type and aggregation method. \n +/// 00: Subset: Each character in \a B is compared for equality with all +/// the characters in \a A. \n +/// 01: Ranges: Each character in \a B is compared to \a A. The comparison +/// basis is greater than or equal for even-indexed elements in \a A, +/// and less than or equal for odd-indexed elements in \a A. \n +/// 10: Match: Compare each pair of corresponding characters in \a A and +/// \a B for equality. \n +/// 11: Substring: Search \a B for substring matches of \a A. \n +/// Bits [5:4]: Determine whether to perform a one's complement on the bit +/// mask of the comparison results. \n +/// 00: No effect. \n +/// 01: Negate the bit mask. \n +/// 10: No effect. \n +/// 11: Negate the bit mask only for bits with an index less than or equal +/// to the size of \a A or \a B. +/// \returns Returns 1 if the length of the string in \a B is less than the +/// maximum, otherwise, returns 0. +#define _mm_cmpestrz(A, LA, B, LB, M) \ + ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ + (__v16qi)(__m128i)(B), (int)(LB), \ + (int)(M))) + +/* SSE4.2 Compare Packed Data -- Greater Than. */ +/// Compares each of the corresponding 64-bit values of the 128-bit +/// integer vectors to determine if the values in the first operand are +/// greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPCMPGTQ / PCMPGTQ instruction. +/// +/// \param __V1 +/// A 128-bit integer vector. +/// \param __V2 +/// A 128-bit integer vector. +/// \returns A 128-bit integer vector containing the comparison results. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) +{ + return (__m128i)((__v2di)__V1 > (__v2di)__V2); +} + +#undef __DEFAULT_FN_ATTRS + +#include + +#include + +#endif /* __SMMINTRIN_H */ diff --git a/include-llvm/tbmintrin.h b/include-llvm/tbmintrin.h new file mode 100644 index 0000000..f4e848a --- /dev/null +++ b/include-llvm/tbmintrin.h @@ -0,0 +1,140 @@ +/*===---- tbmintrin.h - TBM intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __TBMINTRIN_H +#define __TBMINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) + +#define __bextri_u32(a, b) \ + ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \ + (unsigned int)(b))) + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blcfill_u32(unsigned int __a) +{ + return __a & (__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blci_u32(unsigned int __a) +{ + return __a | ~(__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blcic_u32(unsigned int __a) +{ + return ~__a & (__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blcmsk_u32(unsigned int __a) +{ + return __a ^ (__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blcs_u32(unsigned int __a) +{ + return __a | (__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blsfill_u32(unsigned int __a) +{ + return __a | (__a - 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__blsic_u32(unsigned int __a) +{ + return ~__a | (__a - 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__t1mskc_u32(unsigned int __a) +{ + return ~__a | (__a + 1); +} + +static __inline__ unsigned int __DEFAULT_FN_ATTRS +__tzmsk_u32(unsigned int __a) +{ + return ~__a & (__a - 1); +} + +#ifdef __x86_64__ +#define __bextri_u64(a, b) \ + ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \ + (unsigned long long)(b))) + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blcfill_u64(unsigned long long __a) +{ + return __a & (__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blci_u64(unsigned long long __a) +{ + return __a | ~(__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blcic_u64(unsigned long long __a) +{ + return ~__a & (__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blcmsk_u64(unsigned long long __a) +{ + return __a ^ (__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blcs_u64(unsigned long long __a) +{ + return __a | (__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blsfill_u64(unsigned long long __a) +{ + return __a | (__a - 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__blsic_u64(unsigned long long __a) +{ + return ~__a | (__a - 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__t1mskc_u64(unsigned long long __a) +{ + return ~__a | (__a + 1); +} + +static __inline__ unsigned long long __DEFAULT_FN_ATTRS +__tzmsk_u64(unsigned long long __a) +{ + return ~__a & (__a - 1); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif /* __TBMINTRIN_H */ diff --git a/include-llvm/tmmintrin.h b/include-llvm/tmmintrin.h new file mode 100644 index 0000000..e640934 --- /dev/null +++ b/include-llvm/tmmintrin.h @@ -0,0 +1,787 @@ +/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __TMMINTRIN_H +#define __TMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) + +/// Computes the absolute value of each of the packed 8-bit signed +/// integers in the source operand and stores the 8-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PABSB instruction. +/// +/// \param __a +/// A 64-bit vector of [8 x i8]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_abs_pi8(__m64 __a) +{ + return (__m64)__builtin_ia32_pabsb((__v8qi)__a); +} + +/// Computes the absolute value of each of the packed 8-bit signed +/// integers in the source operand and stores the 8-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSB instruction. +/// +/// \param __a +/// A 128-bit vector of [16 x i8]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_abs_epi8(__m128i __a) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a); +#else + return (__m128i)__builtin_elementwise_abs((__v16qs)__a); +#endif +} + +/// Computes the absolute value of each of the packed 16-bit signed +/// integers in the source operand and stores the 16-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PABSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_abs_pi16(__m64 __a) +{ + return (__m64)__builtin_ia32_pabsw((__v4hi)__a); +} + +/// Computes the absolute value of each of the packed 16-bit signed +/// integers in the source operand and stores the 16-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_abs_epi16(__m128i __a) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a); +#else + return (__m128i)__builtin_elementwise_abs((__v8hi)__a); +#endif +} + +/// Computes the absolute value of each of the packed 32-bit signed +/// integers in the source operand and stores the 32-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PABSD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. +/// \returns A 64-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_abs_pi32(__m64 __a) +{ + return (__m64)__builtin_ia32_pabsd((__v2si)__a); +} + +/// Computes the absolute value of each of the packed 32-bit signed +/// integers in the source operand and stores the 32-bit unsigned integer +/// results in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPABSD instruction. 
+/// +/// \param __a +/// A 128-bit vector of [4 x i32]. +/// \returns A 128-bit integer vector containing the absolute values of the +/// elements in the operand. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_abs_epi32(__m128i __a) +{ +#if (__clang_major__ < 14) + return (__m128i)__builtin_ia32_pabsd128((__v4si)__a); +#else + return (__m128i)__builtin_elementwise_abs((__v4si)__a); +#endif +} + +/// Concatenates the two 128-bit integer vector operands, and +/// right-shifts the result by the number of bytes specified in the immediate +/// operand. +/// +/// \headerfile +/// +/// \code +/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n); +/// \endcode +/// +/// This intrinsic corresponds to the \c PALIGNR instruction. +/// +/// \param a +/// A 128-bit vector of [16 x i8] containing one of the source operands. +/// \param b +/// A 128-bit vector of [16 x i8] containing one of the source operands. +/// \param n +/// An immediate operand specifying how many bytes to right-shift the result. +/// \returns A 128-bit integer vector containing the concatenated right-shifted +/// value. +#define _mm_alignr_epi8(a, b, n) \ + ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ + (__v16qi)(__m128i)(b), (n))) + +/// Concatenates the two 64-bit integer vector operands, and right-shifts +/// the result by the number of bytes specified in the immediate operand. +/// +/// \headerfile +/// +/// \code +/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n); +/// \endcode +/// +/// This intrinsic corresponds to the \c PALIGNR instruction. +/// +/// \param a +/// A 64-bit vector of [8 x i8] containing one of the source operands. +/// \param b +/// A 64-bit vector of [8 x i8] containing one of the source operands. +/// \param n +/// An immediate operand specifying how many bytes to right-shift the result. +/// \returns A 64-bit integer vector containing the concatenated right-shifted +/// value. +#define _mm_alignr_pi8(a, b, n) \ + ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))) + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of +/// both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hadd_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 128-bit vector of [4 x i32] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of +/// both operands. 
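As a usage sketch for the horizontal add documented above (and defined just below), reducing all four 32-bit lanes to a single sum; the extraction helper _mm_cvtsi128_si32 is an SSE2 intrinsic pulled in transitively by this header:

static inline int hsum_epi32_example(__m128i v)
{
    __m128i s = _mm_hadd_epi32(v, v);   /* [a+b, c+d, a+b, c+d] */
    s = _mm_hadd_epi32(s, s);           /* every lane = a+b+c+d */
    return _mm_cvtsi128_si32(s);        /* take lane 0          */
}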
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hadd_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHADDW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both +/// operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hadd_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHADDD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 64-bit vector of [2 x i32] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both +/// operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hadd_pi32(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are +/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHADDSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated +/// sums of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hadds_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); +} + +/// Horizontally adds the adjacent pairs of values contained in 2 packed +/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are +/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to +/// 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHADDSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the lower bits of the +/// destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal sums of the values are stored in the upper bits of the +/// destination. 
+/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated +/// sums of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hadds_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [8 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences +/// of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsub_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [4 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x i32] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 128-bit vector of [4 x i32] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. +/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences +/// of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsub_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [4 x i16]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHSUBW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences +/// of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hsub_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHSUBD instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 64-bit vector of [2 x i32] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. 
+/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences +/// of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hsub_pi32(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 128-bit vectors of [8 x i16]. Positive differences greater than +/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are +/// saturated to 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPHSUBSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. +/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated +/// differences of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsubs_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); +} + +/// Horizontally subtracts the adjacent pairs of values contained in 2 +/// packed 64-bit vectors of [4 x i16]. Positive differences greater than +/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are +/// saturated to 0x8000. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PHSUBSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the lower bits of +/// the destination. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source operands. The +/// horizontal differences between the values are stored in the upper bits of +/// the destination. +/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated +/// differences of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_hsubs_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); +} + +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, adds pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. +/// +/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of +/// both operands are multiplied, and the sum of both results is written to +/// bits [15:0] of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMADDUBSW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the first source operand. +/// \param __b +/// A 128-bit integer vector containing the second source operand. 
+/// \returns A 128-bit integer vector containing the sums of products of both +/// operands: \n +/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n +/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n +/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n +/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n +/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n +/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n +/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n +/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maddubs_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); +} + +/// Multiplies corresponding pairs of packed 8-bit unsigned integer +/// values contained in the first source operand and packed 8-bit signed +/// integer values contained in the second source operand, adds pairs of +/// contiguous products with signed saturation, and writes the 16-bit sums to +/// the corresponding bits in the destination. +/// +/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of +/// both operands are multiplied, and the sum of both results is written to +/// bits [15:0] of the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PMADDUBSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the first source operand. +/// \param __b +/// A 64-bit integer vector containing the second source operand. +/// \returns A 64-bit integer vector containing the sums of products of both +/// operands: \n +/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n +/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n +/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n +/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_maddubs_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); +} + +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// products to the 18 most significant bits by right-shifting, rounds the +/// truncated value by adding 1, and writes bits [16:1] to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPMULHRSW instruction. +/// +/// \param __a +/// A 128-bit vector of [8 x i16] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [8 x i16] containing one of the source operands. +/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled +/// products of both operands. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_mulhrs_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); +} + +/// Multiplies packed 16-bit signed integer values, truncates the 32-bit +/// products to the 18 most significant bits by right-shifting, rounds the +/// truncated value by adding 1, and writes bits [16:1] to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PMULHRSW instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16] containing one of the source operands. +/// \param __b +/// A 64-bit vector of [4 x i16] containing one of the source operands. +/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled +/// products of both operands. 
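A small sketch of the rounding high-half multiply used as a Q15 fixed-point multiply; the constants are chosen so the result is easy to verify by hand (0.5 * 0.25 = 0.125):

static inline __m128i q15_mul_example(void)
{
    __m128i half    = _mm_set1_epi16(0x4000);   /* 0.5  in Q15           */
    __m128i quarter = _mm_set1_epi16(0x2000);   /* 0.25 in Q15           */
    return _mm_mulhrs_epi16(half, quarter);     /* 0x1000 = 0.125 in Q15 */
}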
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_mulhrs_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); +} + +/// Copies the 8-bit integers from a 128-bit integer vector to the +/// destination or clears 8-bit values in the destination, as specified by +/// the second source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSHUFB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control bytes corresponding to +/// positions in the destination: +/// Bit 7: \n +/// 1: Clear the corresponding byte in the destination. \n +/// 0: Copy the selected source byte to the corresponding byte in the +/// destination. \n +/// Bits [6:4] Reserved. \n +/// Bits [3:0] select the source byte to be copied. +/// \returns A 128-bit integer vector containing the copied or cleared values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_shuffle_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); +} + +/// Copies the 8-bit integers from a 64-bit integer vector to the +/// destination or clears 8-bit values in the destination, as specified by +/// the second source operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PSHUFB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control bytes corresponding to +/// positions in the destination: +/// Bit 7: \n +/// 1: Clear the corresponding byte in the destination. \n +/// 0: Copy the selected source byte to the corresponding byte in the +/// destination. \n +/// Bits [3:0] select the source byte to be copied. +/// \returns A 64-bit integer vector containing the copied or cleared values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_shuffle_pi8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); +} + +/// For each 8-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the byte in the second source is negative, calculate the two's +/// complement of the corresponding byte in the first source, and write that +/// value to the destination. If the byte in the second source is positive, +/// copy the corresponding byte from the first source to the destination. If +/// the byte in the second source is zero, clear the corresponding byte in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGNB instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control bytes corresponding to +/// positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sign_epi8(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); +} + +/// For each 16-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write that +/// value to the destination. 
If the word in the second source is positive, +/// copy the corresponding word from the first source to the destination. If +/// the word in the second source is zero, clear the corresponding word in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGNW instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control words corresponding to +/// positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sign_epi16(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); +} + +/// For each 32-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the doubleword in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write that +/// value to the destination. If the doubleword in the second source is +/// positive, copy the corresponding word from the first source to the +/// destination. If the doubleword in the second source is zero, clear the +/// corresponding word in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c VPSIGND instruction. +/// +/// \param __a +/// A 128-bit integer vector containing the values to be copied. +/// \param __b +/// A 128-bit integer vector containing control doublewords corresponding to +/// positions in the destination. +/// \returns A 128-bit integer vector containing the resultant values. +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sign_epi32(__m128i __a, __m128i __b) +{ + return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); +} + +/// For each 8-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the byte in the second source is negative, calculate the two's +/// complement of the corresponding byte in the first source, and write that +/// value to the destination. If the byte in the second source is positive, +/// copy the corresponding byte from the first source to the destination. If +/// the byte in the second source is zero, clear the corresponding byte in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PSIGNB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control bytes corresponding to +/// positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_sign_pi8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); +} + +/// For each 16-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the word in the second source is negative, calculate the two's +/// complement of the corresponding word in the first source, and write that +/// value to the destination. If the word in the second source is positive, +/// copy the corresponding word from the first source to the destination. If +/// the word in the second source is zero, clear the corresponding word in +/// the destination. 
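A usage sketch for the 128-bit sign intrinsic defined above; the values are arbitrary and only serve to show the three cases (negative, zero, and positive control lanes):

static inline __m128i sign_example(void)
{
    __m128i v = _mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8);
    __m128i s = _mm_set_epi16(-1, 1, 0, -2, 9, 0, -7, 3);
    /* Lanes, in the same order as the _mm_set_epi16 arguments:
     * -1, 2, 0, -4, 5, 0, -7, 8 */
    return _mm_sign_epi16(v, s);
}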
+/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PSIGNW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing control words corresponding to +/// positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_sign_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); +} + +/// For each 32-bit integer in the first source operand, perform one of +/// the following actions as specified by the second source operand. +/// +/// If the doubleword in the second source is negative, calculate the two's +/// complement of the corresponding doubleword in the first source, and +/// write that value to the destination. If the doubleword in the second +/// source is positive, copy the corresponding doubleword from the first +/// source to the destination. If the doubleword in the second source is +/// zero, clear the corresponding doubleword in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c PSIGND instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values to be copied. +/// \param __b +/// A 64-bit integer vector containing two control doublewords corresponding +/// to positions in the destination. +/// \returns A 64-bit integer vector containing the resultant values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_sign_pi32(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX + +#endif /* __TMMINTRIN_H */ diff --git a/include-llvm/tsxldtrkintrin.h b/include-llvm/tsxldtrkintrin.h new file mode 100644 index 0000000..491823e --- /dev/null +++ b/include-llvm/tsxldtrkintrin.h @@ -0,0 +1,56 @@ +/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __TSXLDTRKINTRIN_H +#define __TSXLDTRKINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define _DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk"))) + +/// Marks the start of an TSX (RTM) suspend load address tracking region. If +/// this intrinsic is used inside a transactional region, subsequent loads +/// are not added to the read set of the transaction. If it's used inside a +/// suspend load address tracking region it will cause transaction abort. +/// If it's used outside of a transactional region it behaves like a NOP. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c XSUSLDTRK instruction. +/// +static __inline__ void _DEFAULT_FN_ATTRS +_xsusldtrk (void) +{ + __builtin_ia32_xsusldtrk(); +} + +/// Marks the end of an TSX (RTM) suspend load address tracking region. If this +/// intrinsic is used inside a suspend load address tracking region it will +/// end the suspend region and all following load addresses will be added to +/// the transaction read set. 
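One way the suspend/resume pair above might be used, sketched under the assumption of RTM plus TSXLDTRK hardware and a clang build with -mrtm -mtsxldtrk; _xbegin, _xend and _XBEGIN_STARTED come from the RTM header that <immintrin.h> provides, and the function name is illustrative:

#include <immintrin.h>

/* Read a piece of bookkeeping data inside a transaction without adding
 * it to the transaction's read set. */
int update_with_untracked_read(int *tracked, const int *untracked)
{
    unsigned status = _xbegin();
    if (status == _XBEGIN_STARTED) {
        _xsusldtrk();              /* suspend load-address tracking */
        int hint = *untracked;     /* this load is not tracked      */
        _xresldtrk();              /* resume tracking               */

        *tracked += hint;          /* normal transactional access   */
        _xend();
        return 0;
    }
    return -1;                     /* aborted; caller takes a fallback path */
}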
If it's used inside an active transaction but +/// not in a suspend region it will cause transaction abort. If it's used +/// outside of a transactional region it behaves like a NOP. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c XRESLDTRK instruction. +/// +static __inline__ void _DEFAULT_FN_ATTRS +_xresldtrk (void) +{ + __builtin_ia32_xresldtrk(); +} + +#undef _DEFAULT_FN_ATTRS + +#endif /* __TSXLDTRKINTRIN_H */ diff --git a/include-llvm/uintrintrin.h b/include-llvm/uintrintrin.h new file mode 100644 index 0000000..e3839dc --- /dev/null +++ b/include-llvm/uintrintrin.h @@ -0,0 +1,157 @@ +/*===------------------ uintrintrin.h - UINTR intrinsics -------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86GPRINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __UINTRINTRIN_H +#define __UINTRINTRIN_H + +/* Define the default attributes for the functions in this file */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("uintr"))) + +#ifdef __x86_64__ + +struct __uintr_frame +{ + unsigned long long rip; + unsigned long long rflags; + unsigned long long rsp; +}; + +/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a +/// user interrupt cannot be delivered on the instruction boundary following +/// CLUI. Can be executed only if CR4.UINT = 1, the logical processor is in +/// 64-bit mode, and software is not executing inside an enclave; otherwise, +/// each causes an invalid-opcode exception. Causes a transactional abort if +/// executed inside a transactional region; the abort loads EAX as it would +/// had it been due to an execution of CLI. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CLUI instruction. +/// +/// \operation +/// UIF := 0 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_clui (void) +{ + __builtin_ia32_clui(); +} + +/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a +/// user interrupt may be delivered on the instruction boundary following +/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in +/// 64-bit mode, and software is not executing inside an enclave; otherwise, +/// each causes an invalid-opcode exception. Causes a transactional abort if +/// executed inside a transactional region; the abort loads EAX as it would +/// had it been due to an execution of STI. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the STUI instruction. +/// +/// \operation +/// UIF := 1 +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_stui (void) +{ + __builtin_ia32_stui(); +} + +/// Get the current value of the user interrupt flag (UIF). Can be executed +/// regardless of CPL and inside a transactional region. Can be executed only +/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is +/// not executing inside an enclave; otherwise, it causes an invalid-opcode +/// exception. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the TESTUI instruction. +/// +/// \returns The current value of the user interrupt flag (UIF). 
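A sketch of how the UIF intrinsics might be combined; user interrupts additionally require 64-bit mode, CR4.UINT enabled by the operating system, and a -muintr build, so this is illustrative only and the helper name is made up:

#include <x86gprintrin.h>

/* Defer user-interrupt delivery around a timing-sensitive section. */
void do_work_without_user_interrupts(void)
{
    unsigned char was_enabled = _testui();  /* sample the current UIF */

    _clui();                                /* block user-interrupt delivery */
    /* ... timing-sensitive work ... */
    if (was_enabled)
        _stui();                            /* re-allow delivery */
}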
+/// +/// \operation +/// CF := UIF +/// ZF := 0 +/// AF := 0 +/// OF := 0 +/// PF := 0 +/// SF := 0 +/// dst := CF +/// \endoperation +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_testui (void) +{ + return __builtin_ia32_testui(); +} + +/// Send interprocessor user interrupt. Can be executed only if +/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode, +/// and software is not executing inside an enclave; otherwise, it causes an +/// invalid-opcode exception. May be executed at any privilege level, all of +/// its memory accesses are performed with supervisor privilege. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the SENDUIPI instruction +/// +/// \param __a +/// Index of user-interrupt target table entry in user-interrupt target +/// table. +/// +/// \operation +/// IF __a > UITTSZ +/// GP (0) +/// FI +/// tempUITTE := MEM[UITTADDR + (a<<4)] +/// // tempUITTE must be valid, and can't have any reserved bit set +/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0) +/// GP (0) +/// FI +/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock +/// // tempUPID can't have any reserved bit set +/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0) +/// GP (0) // release lock +/// FI +/// tempUPID.PIR[tempUITTE.UV] := 1; +/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0) +/// tempUPID.ON := 1 +/// sendNotify := 1 +/// ELSE +/// sendNotify := 0 +/// FI +/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock +/// IF sendNotify == 1 +/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode +/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC +/// // ID tempUPID.NDST +/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST) +/// ELSE +/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC +/// // ID tempUPID.NDST[15:8] +/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8]) +/// FI +/// FI +/// \endoperation +static __inline__ void __DEFAULT_FN_ATTRS +_senduipi (unsigned long long __a) +{ + __builtin_ia32_senduipi(__a); +} + +#endif /* __x86_64__ */ + +#undef __DEFAULT_FN_ATTRS + +#endif /* __UINTRINTRIN_H */ diff --git a/include-llvm/vaesintrin.h b/include-llvm/vaesintrin.h new file mode 100644 index 0000000..294dcff --- /dev/null +++ b/include-llvm/vaesintrin.h @@ -0,0 +1,85 @@ +/*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __VAESINTRIN_H +#define __VAESINTRIN_H + +/* Default attributes for YMM forms. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256))) + +/* Default attributes for ZMM forms. 
*/ +#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512))) + + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenc_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenc256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdec_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdec256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesenclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A, + (__v4di) __B); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS + _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) +{ + return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A, + (__v4di) __B); +} + +#ifdef __AVX512FINTRIN_H +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenc_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenc512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdec_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdec512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesenclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A, + (__v8di) __B); +} + +static __inline__ __m512i __DEFAULT_FN_ATTRS_F + _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) +{ + return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A, + (__v8di) __B); +} +#endif // __AVX512FINTRIN_H + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_F + +#endif // __VAESINTRIN_H diff --git a/include-llvm/vpclmulqdqintrin.h b/include-llvm/vpclmulqdqintrin.h new file mode 100644 index 0000000..485692e --- /dev/null +++ b/include-llvm/vpclmulqdqintrin.h @@ -0,0 +1,30 @@ +/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== + * + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __VPCLMULQDQINTRIN_H +#define __VPCLMULQDQINTRIN_H + +#define _mm256_clmulepi64_epi128(A, B, I) \ + ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ + (__v4di)(__m256i)(B), \ + (char)(I))) + +#ifdef __AVX512FINTRIN_H +#define _mm512_clmulepi64_epi128(A, B, I) \ + ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ + (__v8di)(__m512i)(B), \ + (char)(I))) +#endif // __AVX512FINTRIN_H + +#endif /* __VPCLMULQDQINTRIN_H */ + diff --git a/include-llvm/waitpkgintrin.h b/include-llvm/waitpkgintrin.h new file mode 100644 index 0000000..7ecada4 --- /dev/null +++ b/include-llvm/waitpkgintrin.h @@ -0,0 +1,42 @@ +/*===----------------------- waitpkgintrin.h - WAITPKG --------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." 
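As a usage illustration of the 256-bit VAES forms above: a sketch assuming an AVX2 + VAES build (e.g. -mavx2 -mvaes), where the helper name and the premise that round keys are expanded elsewhere are illustrative; _mm256_broadcastsi128_si256 comes from the AVX2 header that <immintrin.h> provides:

#include <immintrin.h>

/* Apply one AES encryption round to two independent 128-bit blocks
 * packed into a single 256-bit vector. */
__m256i aes_round_two_blocks(__m256i two_states, __m128i round_key)
{
    /* Use the same 128-bit round key in both lanes. */
    __m256i rk = _mm256_broadcastsi128_si256(round_key);
    return _mm256_aesenc_epi128(two_states, rk);
}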
+#endif + +#ifndef __WAITPKGINTRIN_H +#define __WAITPKGINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS \ + __attribute__((__always_inline__, __nodebug__, __target__("waitpkg"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_umonitor (void * __address) +{ + __builtin_ia32_umonitor (__address); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_umwait (unsigned int __control, unsigned long long __counter) +{ + return __builtin_ia32_umwait (__control, + (unsigned int)(__counter >> 32), (unsigned int)__counter); +} + +static __inline__ unsigned char __DEFAULT_FN_ATTRS +_tpause (unsigned int __control, unsigned long long __counter) +{ + return __builtin_ia32_tpause (__control, + (unsigned int)(__counter >> 32), (unsigned int)__counter); +} + +#undef __DEFAULT_FN_ATTRS + +#endif /* __WAITPKGINTRIN_H */ diff --git a/include-llvm/wbnoinvdintrin.h b/include-llvm/wbnoinvdintrin.h new file mode 100644 index 0000000..cac0347 --- /dev/null +++ b/include-llvm/wbnoinvdintrin.h @@ -0,0 +1,24 @@ +/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __WBNOINVDINTRIN_H +#define __WBNOINVDINTRIN_H + +static __inline__ void + __attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd"))) +_wbnoinvd (void) +{ + __builtin_ia32_wbnoinvd (); +} + +#endif /* __WBNOINVDINTRIN_H */ diff --git a/include-llvm/wmmintrin.h b/include-llvm/wmmintrin.h new file mode 100644 index 0000000..49148db --- /dev/null +++ b/include-llvm/wmmintrin.h @@ -0,0 +1,23 @@ +/*===---- wmmintrin.h - AES intrinsics ------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __WMMINTRIN_H +#define __WMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +#include <__wmmintrin_aes.h> + +#include <__wmmintrin_pclmul.h> + +#endif /* __WMMINTRIN_H */ diff --git a/include-llvm/x86gprintrin.h b/include-llvm/x86gprintrin.h new file mode 100644 index 0000000..01e741f --- /dev/null +++ b/include-llvm/x86gprintrin.h @@ -0,0 +1,35 @@ +/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
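A sketch of how the _tpause wrapper above might be used for a bounded busy-wait, assuming a -mwaitpkg build; __rdtsc comes from <x86intrin.h> and the helper names are illustrative. The return value mirrors the instruction's carry flag:

#include <x86intrin.h>

/* Pause the current core until a TSC deadline, requesting the deeper
 * C0.2 state (control = 0). A nonzero return generally means the wait
 * was cut short by the OS time limit rather than by the deadline. */
unsigned char tpause_until(unsigned long long tsc_deadline)
{
    return _tpause(0, tsc_deadline);
}

/* Example: back off for roughly 10,000 TSC cycles. */
void short_backoff(void)
{
    tpause_until(__rdtsc() + 10000);
}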
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86GPRINTRIN_H +#define __X86GPRINTRIN_H + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__HRESET__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__UINTR__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__CRC32__) +#include +#endif + +#define __SSC_MARK(Tag) \ + __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \ + "mov {%0, %%ebx|ebx, %0}; " \ + ".byte 0x64, 0x67, 0x90; " \ + "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \ + : "%eax"); + +#endif /* __X86GPRINTRIN_H */ diff --git a/include-llvm/x86intrin.h b/include-llvm/x86intrin.h new file mode 100644 index 0000000..768d0e5 --- /dev/null +++ b/include-llvm/x86intrin.h @@ -0,0 +1,63 @@ +/*===---- x86intrin.h - X86 intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#define __X86INTRIN_H + +#include + +#include + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__3dNOW__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__PRFCHW__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__SSE4A__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__FMA4__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__XOP__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__TBM__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__LWP__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__MWAITX__) +#include +#endif + +#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ + defined(__CLZERO__) +#include +#endif + + +#endif /* __X86INTRIN_H */ diff --git a/include-llvm/xmmintrin.h b/include-llvm/xmmintrin.h new file mode 100644 index 0000000..1612d3d --- /dev/null +++ b/include-llvm/xmmintrin.h @@ -0,0 +1,3012 @@ +/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
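The __SSC_MARK macro above emits the magic no-op sequence that Intel's SDE and related tools recognize as region markers. A sketch, where the 0x111/0x222 tag values follow the common start/stop convention (an assumption, not something this header mandates):

#include <x86gprintrin.h>

void region_of_interest(void)
{
    __SSC_MARK(0x111);   /* conventional "start collection" tag */
    /* ... code whose instruction stream should be analysed ... */
    __SSC_MARK(0x222);   /* conventional "stop collection" tag  */
}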
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __XMMINTRIN_H +#define __XMMINTRIN_H + +#if !defined(__i386__) && !defined(__x86_64__) +#error "This header is only meant to be used on x86 and x64 architecture" +#endif + +#include + +typedef int __v4si __attribute__((__vector_size__(16))); +typedef float __v4sf __attribute__((__vector_size__(16))); +typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); + +typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); + +/* Unsigned types */ +typedef unsigned int __v4su __attribute__((__vector_size__(16))); + +/* This header should only be included in a hosted environment as it depends on + * a standard library to provide allocation routines. */ +#if __STDC_HOSTED__ +#include +#endif + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) + +/// Adds the 32-bit float values in the low-order bits of the operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDSS / ADDSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The lower 32 bits of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The lower 32 bits of this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum +/// of the lower 32 bits of both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_add_ss(__m128 __a, __m128 __b) +{ + __a[0] += __b[0]; + return __a; +} + +/// Adds two 128-bit vectors of [4 x float], and returns the results of +/// the addition. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VADDPS / ADDPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \returns A 128-bit vector of [4 x float] containing the sums of both +/// operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_add_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a + (__v4sf)__b); +} + +/// Subtracts the 32-bit float value in the low-order bits of the second +/// operand from the corresponding value in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBSS / SUBSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits +/// of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 +/// bits of this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// difference of the lower 32 bits of both operands. The upper 96 bits are +/// copied from the upper 96 bits of the first source operand. 
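A small sketch contrasting the packed and scalar adds above (the scalar form keeps the upper three lanes of the first operand); it assumes an SSE build and uses _mm_setr_ps and _mm_storeu_ps, which are defined later in this header:

#include <xmmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);

    __m128 packed = _mm_add_ps(a, b);  /* {11, 22, 33, 44}                */
    __m128 scalar = _mm_add_ss(a, b);  /* {11, 2, 3, 4}: upper 96 bits    */
                                       /* are copied from a               */
    float p[4], s[4];
    _mm_storeu_ps(p, packed);
    _mm_storeu_ps(s, scalar);
    printf("packed: %g %g %g %g\n", p[0], p[1], p[2], p[3]);
    printf("scalar: %g %g %g %g\n", s[0], s[1], s[2], s[3]);
    return 0;
}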
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_sub_ss(__m128 __a, __m128 __b) +{ + __a[0] -= __b[0]; + return __a; +} + +/// Subtracts each of the values of the second operand from the first +/// operand, both of which are 128-bit vectors of [4 x float] and returns +/// the results of the subtraction. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSUBPS / SUBPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the minuend. +/// \param __b +/// A 128-bit vector of [4 x float] containing the subtrahend. +/// \returns A 128-bit vector of [4 x float] containing the differences between +/// both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_sub_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a - (__v4sf)__b); +} + +/// Multiplies two 32-bit float values in the low-order bits of the +/// operands. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULSS / MULSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The lower 32 bits of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// The lower 32 bits of this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the product of the lower +/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 +/// bits of the first source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mul_ss(__m128 __a, __m128 __b) +{ + __a[0] *= __b[0]; + return __a; +} + +/// Multiplies two 128-bit vectors of [4 x float] and returns the +/// results of the multiplication. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMULPS / MULPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \returns A 128-bit vector of [4 x float] containing the products of both +/// operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_mul_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a * (__v4sf)__b); +} + +/// Divides the value in the low-order 32 bits of the first operand by +/// the corresponding value in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVSS / DIVSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 +/// bits of this operand are used in the calculation. +/// \param __b +/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits +/// of this operand are used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the quotients of the +/// lower 32 bits of both operands. The upper 96 bits are copied from the +/// upper 96 bits of the first source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_div_ss(__m128 __a, __m128 __b) +{ + __a[0] /= __b[0]; + return __a; +} + +/// Divides two 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VDIVPS / DIVPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the dividend. +/// \param __b +/// A 128-bit vector of [4 x float] containing the divisor. +/// \returns A 128-bit vector of [4 x float] containing the quotients of both +/// operands. 
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_div_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4sf)__a / (__v4sf)__b); +} + +/// Calculates the square root of the value stored in the low-order bits +/// of a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTSS / SQRTSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the square root of the +/// value in the low-order bits of the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_sqrt_ss(__m128 __a) +{ + return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); +} + +/// Calculates the square roots of the values stored in a 128-bit vector +/// of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSQRTPS / SQRTPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the square roots of the +/// values in the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_sqrt_ps(__m128 __a) +{ + return __builtin_ia32_sqrtps((__v4sf)__a); +} + +/// Calculates the approximate reciprocal of the value stored in the +/// low-order bits of a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRCPSS / RCPSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocal of the value in the low-order bits of the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_rcp_ss(__m128 __a) +{ + return (__m128)__builtin_ia32_rcpss((__v4sf)__a); +} + +/// Calculates the approximate reciprocals of the values stored in a +/// 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRCPPS / RCPPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocals of the values in the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_rcp_ps(__m128 __a) +{ + return (__m128)__builtin_ia32_rcpps((__v4sf)__a); +} + +/// Calculates the approximate reciprocal of the square root of the value +/// stored in the low-order bits of a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRSQRTSS / RSQRTSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the calculation. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocal of the square root of the value in the low-order bits of the +/// operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_rsqrt_ss(__m128 __a) +{ + return __builtin_ia32_rsqrtss((__v4sf)__a); +} + +/// Calculates the approximate reciprocals of the square roots of the +/// values stored in a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VRSQRTPS / RSQRTPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the approximate +/// reciprocals of the square roots of the values in the operand. 
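Because _mm_rsqrt_ps returns only an approximation (roughly 12 bits of precision), it is commonly refined with one Newton-Raphson step; a sketch, with _mm_set1_ps assumed from later in this header:

#include <xmmintrin.h>

/* Refine the hardware reciprocal-square-root estimate with one
 * Newton-Raphson iteration: y' = 0.5 * y * (3 - x * y * y). */
static inline __m128 rsqrt_nr_ps(__m128 x)
{
    const __m128 half  = _mm_set1_ps(0.5f);
    const __m128 three = _mm_set1_ps(3.0f);

    __m128 y = _mm_rsqrt_ps(x);                       /* ~12-bit estimate */
    __m128 t = _mm_mul_ps(_mm_mul_ps(x, y), y);       /* x*y*y            */
    return _mm_mul_ps(_mm_mul_ps(half, y),
                      _mm_sub_ps(three, t));          /* 0.5*y*(3 - x*y*y) */
}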
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_rsqrt_ps(__m128 __a) +{ + return __builtin_ia32_rsqrtps((__v4sf)__a); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands and returns the lesser value in the low-order bits of the +/// vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINSS / MINSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// minimum value between both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_min_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 128-bit vectors of [4 x float] and returns the lesser +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMINPS / MINPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. +/// \returns A 128-bit vector of [4 x float] containing the minimum values +/// between both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_min_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands and returns the greater value in the low-order bits of a 128-bit +/// vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXSS / MAXSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// maximum value between both operands. The upper 96 bits are copied from +/// the upper 96 bits of the first source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_max_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 128-bit vectors of [4 x float] and returns the greater +/// of each pair of values. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMAXPS / MAXPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. +/// \returns A 128-bit vector of [4 x float] containing the maximum values +/// between both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_max_ps(__m128 __a, __m128 __b) +{ + return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); +} + +/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDPS / ANDPS instructions. +/// +/// \param __a +/// A 128-bit vector containing one of the source operands. +/// \param __b +/// A 128-bit vector containing one of the source operands. 
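A typical use of the packed min/max pair above is a branchless clamp; a minimal sketch:

#include <xmmintrin.h>

/* Clamp four floats to [lo, hi] using the packed min/max intrinsics. */
static inline __m128 clamp_ps(__m128 v, __m128 lo, __m128 hi)
{
    return _mm_min_ps(_mm_max_ps(v, lo), hi);
}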
+/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the +/// values between both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_and_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a & (__v4su)__b); +} + +/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using +/// the one's complement of the values contained in the first source +/// operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VANDNPS / ANDNPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing the first source operand. The +/// one's complement of this value is used in the bitwise AND. +/// \param __b +/// A 128-bit vector of [4 x float] containing the second source operand. +/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the +/// one's complement of the first operand and the values in the second +/// operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_andnot_ps(__m128 __a, __m128 __b) +{ + return (__m128)(~(__v4su)__a & (__v4su)__b); +} + +/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VORPS / ORPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the +/// values between both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_or_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a | (__v4su)__b); +} + +/// Performs a bitwise exclusive OR of two 128-bit vectors of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS / XORPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the source operands. +/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR +/// of the values between both operands. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_xor_ps(__m128 __a, __m128 __b) +{ + return (__m128)((__v4su)__a ^ (__v4su)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands for equality and returns the result of the comparison in the +/// low-order bits of a vector [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPEQSS / CMPEQSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpeq_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] for equality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPEQPS / CMPEQPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
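The bitwise operations treat the [4 x float] lanes purely as bit patterns, which makes sign-bit tricks cheap; for example, a packed absolute value can clear the sign bits with ANDN (a sketch, with _mm_set1_ps assumed from later in this header):

#include <xmmintrin.h>

/* Packed absolute value: clear the sign bit of each lane. */
static inline __m128 abs_ps(__m128 v)
{
    const __m128 sign_mask = _mm_set1_ps(-0.0f);   /* 0x80000000 per lane */
    return _mm_andnot_ps(sign_mask, v);            /* ~mask & v           */
}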
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpeq_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is less than the +/// corresponding value in the second operand and returns the result of the +/// comparison in the low-order bits of a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTSS / CMPLTSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmplt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTPS / CMPLTPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmplt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is less than or +/// equal to the corresponding value in the second operand and returns the +/// result of the comparison in the low-order bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLESS / CMPLESS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmple_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are less than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLEPS / CMPLEPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
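The packed compares return all-ones or all-zero lane masks rather than booleans, so they combine with the AND/ANDN/OR operations above into a branchless per-lane select; a sketch:

#include <xmmintrin.h>

/* Per-lane select: pick a[i] where a[i] < b[i], otherwise b[i]
 * (a branchless packed minimum built from a compare mask). */
static inline __m128 select_lt_ps(__m128 a, __m128 b)
{
    __m128 mask = _mm_cmplt_ps(a, b);            /* all-ones where a < b     */
    return _mm_or_ps(_mm_and_ps(mask, a),
                     _mm_andnot_ps(mask, b));    /* (mask & a) | (~mask & b) */
}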
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmple_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is greater than +/// the corresponding value in the second operand and returns the result of +/// the comparison in the low-order bits of a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTSS / CMPLTSS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpgt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_shufflevector((__v4sf)__a, + (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), + 4, 1, 2, 3); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLTPS / CMPLTPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpgt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is greater than +/// or equal to the corresponding value in the second operand and returns +/// the result of the comparison in the low-order bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLESS / CMPLESS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpge_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_shufflevector((__v4sf)__a, + (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), + 4, 1, 2, 3); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are greater than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPLEPS / CMPLEPS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpge_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands for inequality and returns the result of the comparison in the +/// low-order bits of a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNEQSS / CMPNEQSS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpneq_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] for inequality. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNEQPS / CMPNEQPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpneq_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not less than +/// the corresponding value in the second operand and returns the result of +/// the comparison in the low-order bits of a vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTSS / CMPNLTSS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnlt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not less than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTPS / CMPNLTPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnlt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not less than +/// or equal to the corresponding value in the second operand and returns +/// the result of the comparison in the low-order bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLESS / CMPNLESS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnle_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not less than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLEPS / CMPNLEPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnle_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not greater +/// than the corresponding value in the second operand and returns the +/// result of the comparison in the low-order bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTSS / CMPNLTSS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpngt_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_shufflevector((__v4sf)__a, + (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), + 4, 1, 2, 3); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not greater than those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLTPS / CMPNLTPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpngt_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is not greater +/// than or equal to the corresponding value in the second operand and +/// returns the result of the comparison in the low-order bits of a vector +/// of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLESS / CMPNLESS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnge_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_shufflevector((__v4sf)__a, + (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), + 4, 1, 2, 3); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are not greater than or equal to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPNLEPS / CMPNLEPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpnge_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is ordered with +/// respect to the corresponding value in the second operand and returns the +/// result of the comparison in the low-order bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPORDSS / CMPORDSS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpord_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are ordered with respect to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPORDPS / CMPORDPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpord_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the value in the first operand is unordered +/// with respect to the corresponding value in the second operand and +/// returns the result of the comparison in the low-order bits of a vector +/// of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPUNORDSS / CMPUNORDSS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float] containing one of the operands. The lower +/// 32 bits of this operand are used in the comparison. +/// \returns A 128-bit vector of [4 x float] containing the comparison results +/// in the low-order bits. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpunord_ss(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); +} + +/// Compares each of the corresponding 32-bit float values of the +/// 128-bit vectors of [4 x float] to determine if the values in the first +/// operand are unordered with respect to those in the second operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCMPUNORDPS / CMPUNORDPS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// \returns A 128-bit vector of [4 x float] containing the comparison results. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cmpunord_ps(__m128 __a, __m128 __b) +{ + return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands for equality and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the +/// two lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comieq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is less than the second +/// operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. 
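Because a NaN never compares ordered with itself, _mm_cmpord_ps(v, v) yields a mask of the non-NaN lanes, which is a common way to scrub NaNs from a vector; a sketch:

#include <xmmintrin.h>

/* Zero out any lane that holds a NaN: a value compares "ordered"
 * with itself only when it is not a NaN. */
static inline __m128 zero_nans_ps(__m128 v)
{
    return _mm_and_ps(v, _mm_cmpord_ps(v, v));
}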
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_comilt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is less than or equal to the +/// second operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comile_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is greater than the second +/// operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the +/// two lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comigt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is greater than or equal to +/// the second operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comige_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); +} + +/// Compares two 32-bit float values in the low-order bits of both +/// operands to determine if the first operand is not equal to the second +/// operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 1 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCOMISS / COMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. 
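The COMISS-based intrinsics return a plain int, so they slot directly into scalar control flow; a sketch, with _mm_cvtss_f32 assumed from later in this header (note the documented NaN behaviour: these ordered compares return 0 on NaN input):

#include <xmmintrin.h>

/* Branch on the low lanes of two vectors and return the smaller one;
 * with a NaN in either low lane the compare yields 0 and b's lane wins. */
static inline float smaller_low_lane(__m128 a, __m128 b)
{
    if (_mm_comilt_ss(a, b))
        return _mm_cvtss_f32(a);
    return _mm_cvtss_f32(b);
}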
If either of the +/// two lower 32-bit values is NaN, 1 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_comineq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine equality and returns +/// the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomieq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine if the first operand is +/// less than the second operand and returns the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomilt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine if the first operand is +/// less than or equal to the second operand and returns the result of the +/// comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomile_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine if the first operand is +/// greater than the second operand and returns the result of the +/// comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomigt_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine if the first operand is +/// greater than or equal to the second operand and returns the result of +/// the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 0 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 0 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomige_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); +} + +/// Performs an unordered comparison of two 32-bit float values using +/// the low-order bits of both operands to determine inequality and returns +/// the result of the comparison. +/// +/// If either of the two lower 32-bit values is NaN, 1 is returned. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \param __b +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the comparison. +/// \returns An integer containing the comparison results. If either of the two +/// lower 32-bit values is NaN, 1 is returned. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_ucomineq_ss(__m128 __a, __m128 __b) +{ + return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); +} + +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 32-bit integer containing the converted value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvtss_si32(__m128 __a) +{ + return __builtin_ia32_cvtss2si((__v4sf)__a); +} + +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 32-bit integer containing the converted value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvt_ss2si(__m128 __a) +{ + return _mm_cvtss_si32(__a); +} + +#ifdef __x86_64__ + +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 64-bit integer. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 64-bit integer containing the converted value. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvtss_si64(__m128 __a) +{ + return __builtin_ia32_cvtss2si64((__v4sf)__a); +} + +#endif + +/// Converts two low-order float values in a 128-bit vector of +/// [4 x float] into a 64-bit vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvtps_pi32(__m128 __a) +{ + return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); +} + +/// Converts two low-order float values in a 128-bit vector of +/// [4 x float] into a 64-bit vector of [2 x i32]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvt_ps2pi(__m128 __a) +{ + return _mm_cvtps_pi32(__a); +} + +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer, truncating the result when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 32-bit integer containing the converted value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvttss_si32(__m128 __a) +{ + return __builtin_ia32_cvttss2si((__v4sf)__a); +} + +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 32-bit integer, truncating the result when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 32-bit integer containing the converted value. +static __inline__ int __DEFAULT_FN_ATTRS +_mm_cvtt_ss2si(__m128 __a) +{ + return _mm_cvttss_si32(__a); +} + +#ifdef __x86_64__ +/// Converts a float value contained in the lower 32 bits of a vector of +/// [4 x float] into a 64-bit integer, truncating the result when it is +/// inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the conversion. +/// \returns A 64-bit integer containing the converted value. +static __inline__ long long __DEFAULT_FN_ATTRS +_mm_cvttss_si64(__m128 __a) +{ + return __builtin_ia32_cvttss2si64((__v4sf)__a); +} +#endif + +/// Converts two low-order float values in a 128-bit vector of +/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result +/// when it is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTTPS2PI / VTTPS2PI +/// instructions. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. 
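+///
+/// A minimal usage sketch (arbitrary values; _mm_set_ps is declared in this
+/// header and _mm_empty in mmintrin.h):
+/// \code
+/// __m128 v = _mm_set_ps(0.0f, 0.0f, -2.5f, 1.9f); // low two lanes: 1.9, -2.5
+/// __m64  r = _mm_cvttps_pi32(v);                  // r = { 1, -2 }, truncated
+/// _mm_empty();                                    // clear MMX state afterwards
+/// \endcode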
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvttps_pi32(__m128 __a) +{ + return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); +} + +/// Converts two low-order float values in a 128-bit vector of [4 x +/// float] into a 64-bit vector of [2 x i32], truncating the result when it +/// is inexact. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTTPS2PI instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \returns A 64-bit integer vector containing the converted values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvtt_ps2pi(__m128 __a) +{ + return _mm_cvttps_pi32(__a); +} + +/// Converts a 32-bit signed integer value into a floating point value +/// and writes it to the lower 32 bits of the destination. The remaining +/// higher order elements of the destination vector are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 32-bit signed integer operand containing the value to be converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied from +/// the upper 96 bits of the first operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvtsi32_ss(__m128 __a, int __b) +{ + __a[0] = __b; + return __a; +} + +/// Converts a 32-bit signed integer value into a floating point value +/// and writes it to the lower 32 bits of the destination. The remaining +/// higher order elements of the destination are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 32-bit signed integer operand containing the value to be converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied from +/// the upper 96 bits of the first operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvt_si2ss(__m128 __a, int __b) +{ + return _mm_cvtsi32_ss(__a, __b); +} + +#ifdef __x86_64__ + +/// Converts a 64-bit signed integer value into a floating point value +/// and writes it to the lower 32 bits of the destination. The remaining +/// higher order elements of the destination are copied from the +/// corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 64-bit signed integer operand containing the value to be converted. +/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the +/// converted value of the second operand. The upper 96 bits are copied from +/// the upper 96 bits of the first operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_cvtsi64_ss(__m128 __a, long long __b) +{ + __a[0] = __b; + return __a; +} + +#endif + +/// Converts two elements of a 64-bit vector of [2 x i32] into two +/// floating point values and writes them to the lower 64-bits of the +/// destination. The remaining higher order elements of the destination are +/// copied from the corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS instruction. 
+/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 64-bit vector of [2 x i32]. The elements in this vector are converted +/// and written to the corresponding low-order elements in the destination. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted value of the second operand. The upper 64 bits are copied from +/// the upper 64 bits of the first operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpi32_ps(__m128 __a, __m64 __b) +{ + return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); +} + +/// Converts two elements of a 64-bit vector of [2 x i32] into two +/// floating point values and writes them to the lower 64-bits of the +/// destination. The remaining higher order elements of the destination are +/// copied from the corresponding elements in the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. +/// \param __b +/// A 64-bit vector of [2 x i32]. The elements in this vector are converted +/// and written to the corresponding low-order elements in the destination. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// converted value from the second operand. The upper 64 bits are copied +/// from the upper 64 bits of the first operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvt_pi2ps(__m128 __a, __m64 __b) +{ + return _mm_cvtpi32_ps(__a, __b); +} + +/// Extracts a float value contained in the lower 32 bits of a vector of +/// [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are +/// used in the extraction. +/// \returns A 32-bit float containing the extracted value. +static __inline__ float __DEFAULT_FN_ATTRS +_mm_cvtss_f32(__m128 __a) +{ + return __a[0]; +} + +/// Loads two packed float values from the address \a __p into the +/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits +/// are copied from the low-order bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] +/// of the destination. +/// \param __p +/// A pointer to two packed float values. Bits [63:0] are written to bits +/// [127:64] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_loadh_pi(__m128 __a, const __m64 *__p) +{ + typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadh_pi_struct { + __mm_loadh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u; + __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); + return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); +} + +/// Loads two packed float values from the address \a __p into the +/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits +/// are copied from the high-order bits of the first operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits +/// [127:64] of the destination. 
+/// \param __p +/// A pointer to two packed float values. Bits [63:0] are written to bits +/// [63:0] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the moved values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_loadl_pi(__m128 __a, const __m64 *__p) +{ + typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_loadl_pi_struct { + __mm_loadl_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u; + __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); + return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); +} + +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// 32 bits of the vector are initialized with the single-precision +/// floating-point value loaded from a specified memory location. The upper +/// 96 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. +/// +/// \param __p +/// A pointer to a 32-bit memory location containing a single-precision +/// floating-point value. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. The +/// lower 32 bits contain the value loaded from the memory location. The +/// upper 96 bits are set to zero. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_load_ss(const float *__p) +{ + struct __mm_load_ss_struct { + float __u; + } __attribute__((__packed__, __may_alias__)); + float __u = ((const struct __mm_load_ss_struct*)__p)->__u; + return __extension__ (__m128){ __u, 0, 0, 0 }; +} + +/// Loads a 32-bit float value and duplicates it to all four vector +/// elements of a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBROADCASTSS / MOVSS + shuffling +/// instruction. +/// +/// \param __p +/// A pointer to a float value to be loaded and duplicated. +/// \returns A 128-bit vector of [4 x float] containing the loaded and +/// duplicated values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_load1_ps(const float *__p) +{ + struct __mm_load1_ps_struct { + float __u; + } __attribute__((__packed__, __may_alias__)); + float __u = ((const struct __mm_load1_ps_struct*)__p)->__u; + return __extension__ (__m128){ __u, __u, __u, __u }; +} + +#define _mm_load_ps1(p) _mm_load1_ps(p) + +/// Loads a 128-bit floating-point vector of [4 x float] from an aligned +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 128-bit aligned. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_load_ps(const float *__p) +{ + return *(const __m128*)__p; +} + +/// Loads a 128-bit floating-point vector of [4 x float] from an +/// unaligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
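+///
+/// A minimal usage sketch (the buffer contents are arbitrary; no alignment of
+/// the source pointer is assumed):
+/// \code
+/// float buf[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
+/// __m128 v = _mm_loadu_ps(buf);   // v = { 1.0, 2.0, 3.0, 4.0 }
+/// \endcode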
+static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_loadu_ps(const float *__p) +{ + struct __loadu_ps { + __m128_u __v; + } __attribute__((__packed__, __may_alias__)); + return ((const struct __loadu_ps*)__p)->__v; +} + +/// Loads four packed float values, in reverse order, from an aligned +/// memory location to 32-bit elements in a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS / MOVAPS + shuffling +/// instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 128-bit aligned. +/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded +/// in reverse order. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_loadr_ps(const float *__p) +{ + __m128 __a = _mm_load_ps(__p); + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); +} + +/// Create a 128-bit vector of [4 x float] with undefined values. +/// +/// \headerfile +/// +/// This intrinsic has no corresponding instruction. +/// +/// \returns A 128-bit vector of [4 x float] containing undefined values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_undefined_ps(void) +{ + return (__m128)__builtin_ia32_undef128(); +} + +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// 32 bits of the vector are initialized with the specified single-precision +/// floating-point value. The upper 96 bits are set to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. +/// +/// \param __w +/// A single-precision floating-point value used to initialize the lower 32 +/// bits of the result. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. The +/// lower 32 bits contain the value provided in the source operand. The +/// upper 96 bits are set to zero. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_set_ss(float __w) +{ + return __extension__ (__m128){ __w, 0, 0, 0 }; +} + +/// Constructs a 128-bit floating-point vector of [4 x float], with each +/// of the four single-precision floating-point vector elements set to the +/// specified single-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS / PERMILPS instruction. +/// +/// \param __w +/// A single-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_set1_ps(float __w) +{ + return __extension__ (__m128){ __w, __w, __w, __w }; +} + +/* Microsoft specific. */ +/// Constructs a 128-bit floating-point vector of [4 x float], with each +/// of the four single-precision floating-point vector elements set to the +/// specified single-precision floating-point value. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPERMILPS / PERMILPS instruction. +/// +/// \param __w +/// A single-precision floating-point value used to initialize each vector +/// element of the result. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_set_ps1(float __w) +{ + return _mm_set1_ps(__w); +} + +/// Constructs a 128-bit floating-point vector of [4 x float] +/// initialized with the specified single-precision floating-point values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. 
+/// +/// \param __z +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \param __y +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. +/// \param __x +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. +/// \param __w +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_set_ps(float __z, float __y, float __x, float __w) +{ + return __extension__ (__m128){ __w, __x, __y, __z }; +} + +/// Constructs a 128-bit floating-point vector of [4 x float], +/// initialized in reverse order with the specified 32-bit single-precision +/// float-point values. +/// +/// \headerfile +/// +/// This intrinsic is a utility function and does not correspond to a specific +/// instruction. +/// +/// \param __z +/// A single-precision floating-point value used to initialize bits [31:0] +/// of the result. +/// \param __y +/// A single-precision floating-point value used to initialize bits [63:32] +/// of the result. +/// \param __x +/// A single-precision floating-point value used to initialize bits [95:64] +/// of the result. +/// \param __w +/// A single-precision floating-point value used to initialize bits [127:96] +/// of the result. +/// \returns An initialized 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_setr_ps(float __z, float __y, float __x, float __w) +{ + return __extension__ (__m128){ __z, __y, __x, __w }; +} + +/// Constructs a 128-bit floating-point vector of [4 x float] initialized +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VXORPS / XORPS instruction. +/// +/// \returns An initialized 128-bit floating-point vector of [4 x float] with +/// all elements set to zero. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_setzero_ps(void) +{ + return __extension__ (__m128){ 0, 0, 0, 0 }; +} + +/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VPEXTRQ / PEXTRQ instruction. +/// +/// \param __p +/// A pointer to a 64-bit memory location. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeh_pi(__m64 *__p, __m128 __a) +{ + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); +} + +/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVLPS / MOVLPS instruction. +/// +/// \param __p +/// A pointer to a memory location that will receive the float values. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. 
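+///
+/// A minimal usage sketch (values are arbitrary; note that only the two
+/// low-order floats are written):
+/// \code
+/// __m128 v = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f); // lanes, low to high: 1, 2, 3, 4
+/// float lo[2];
+/// _mm_storel_pi((__m64 *)lo, v);                 // lo == { 1.0, 2.0 }
+/// \endcode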
+static __inline__ void __DEFAULT_FN_ATTRS +_mm_storel_pi(__m64 *__p, __m128 __a) +{ + typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); + struct __mm_storeh_pi_struct { + __mm_storeh_pi_v2f32 __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); +} + +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a +/// memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. +/// +/// \param __p +/// A pointer to a 32-bit memory location. +/// \param __a +/// A 128-bit vector of [4 x float] containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_ss(float *__p, __m128 __a) +{ + struct __mm_store_ss_struct { + float __u; + } __attribute__((__packed__, __may_alias__)); + ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; +} + +/// Stores a 128-bit vector of [4 x float] to an unaligned memory +/// location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location does not have to be aligned. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storeu_ps(float *__p, __m128 __a) +{ + struct __storeu_ps { + __m128_u __v; + } __attribute__((__packed__, __may_alias__)); + ((struct __storeu_ps*)__p)->__v = __a; +} + +/// Stores a 128-bit vector of [4 x float] into an aligned memory +/// location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 16-byte aligned. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_ps(float *__p, __m128 __a) +{ + *(__m128*)__p = __a; +} + +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// four contiguous elements in an aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to VMOVAPS / MOVAPS + shuffling +/// instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. +/// \param __a +/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each +/// of the four contiguous elements pointed by \a __p. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store1_ps(float *__p, __m128 __a) +{ + __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); + _mm_store_ps(__p, __a); +} + +/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into +/// four contiguous elements in an aligned memory location. +/// +/// \headerfile +/// +/// This intrinsic corresponds to VMOVAPS / MOVAPS + shuffling +/// instruction. +/// +/// \param __p +/// A pointer to a 128-bit memory location. +/// \param __a +/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each +/// of the four contiguous elements pointed by \a __p. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_store_ps1(float *__p, __m128 __a) +{ + _mm_store1_ps(__p, __a); +} + +/// Stores float values from a 128-bit vector of [4 x float] to an +/// aligned memory location in reverse order. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVAPS / MOVAPS + shuffling +/// instruction. 
+/// +/// \param __p +/// A pointer to a 128-bit memory location. The address of the memory +/// location has to be 128-bit aligned. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be stored. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_storer_ps(float *__p, __m128 __a) +{ + __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); + _mm_store_ps(__p, __a); +} + +#define _MM_HINT_ET0 7 +#define _MM_HINT_ET1 6 +#define _MM_HINT_T0 3 +#define _MM_HINT_T1 2 +#define _MM_HINT_T2 1 +#define _MM_HINT_NTA 0 + +#ifndef _MSC_VER +/* FIXME: We have to #define this because "sel" must be a constant integer, and + Sema doesn't do any form of constant propagation yet. */ + +/// Loads one cache line of data from the specified address to a location +/// closer to the processor. +/// +/// \headerfile +/// +/// \code +/// void _mm_prefetch(const void * a, const int sel); +/// \endcode +/// +/// This intrinsic corresponds to the PREFETCHNTA instruction. +/// +/// \param a +/// A pointer to a memory location containing a cache line of data. +/// \param sel +/// A predefined integer constant specifying the type of prefetch +/// operation: \n +/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The +/// PREFETCHNTA instruction will be generated. \n +/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will +/// be generated. \n +/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will +/// be generated. \n +/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will +/// be generated. +#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \ + ((sel) >> 2) & 1, (sel) & 0x3)) +#endif + +/// Stores a 64-bit integer in the specified aligned memory location. To +/// minimize caching, the data is flagged as non-temporal (unlikely to be +/// used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MOVNTQ instruction. +/// +/// \param __p +/// A pointer to an aligned memory location used to store the register value. +/// \param __a +/// A 64-bit integer containing the value to be stored. +static __inline__ void __DEFAULT_FN_ATTRS_MMX +_mm_stream_pi(__m64 *__p, __m64 __a) +{ + __builtin_ia32_movntq(__p, __a); +} + +/// Moves packed float values from a 128-bit vector of [4 x float] to a +/// 128-bit aligned memory location. To minimize caching, the data is flagged +/// as non-temporal (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVNTPS / MOVNTPS instruction. +/// +/// \param __p +/// A pointer to a 128-bit aligned memory location that will receive the +/// single-precision floating-point values. +/// \param __a +/// A 128-bit vector of [4 x float] containing the values to be moved. +static __inline__ void __DEFAULT_FN_ATTRS +_mm_stream_ps(float *__p, __m128 __a) +{ + __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); +} + +#if defined(__cplusplus) +extern "C" { +#endif + +/// Forces strong memory ordering (serialization) between store +/// instructions preceding this instruction and store instructions following +/// this instruction, ensuring the system completes all previous stores +/// before executing subsequent stores. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the SFENCE instruction. 
+/// +void _mm_sfence(void); + +#if defined(__cplusplus) +} // extern "C" +#endif + +/// Extracts 16-bit element from a 64-bit vector of [4 x i16] and +/// returns it, as specified by the immediate integer operand. +/// +/// \headerfile +/// +/// \code +/// int _mm_extract_pi16(__m64 a, int n); +/// \endcode +/// +/// This intrinsic corresponds to the VPEXTRW / PEXTRW instruction. +/// +/// \param a +/// A 64-bit vector of [4 x i16]. +/// \param n +/// An immediate integer operand that determines which bits are extracted: \n +/// 0: Bits [15:0] are copied to the destination. \n +/// 1: Bits [31:16] are copied to the destination. \n +/// 2: Bits [47:32] are copied to the destination. \n +/// 3: Bits [63:48] are copied to the destination. +/// \returns A 16-bit integer containing the extracted 16 bits of packed data. +#define _mm_extract_pi16(a, n) \ + ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) + +/// Copies data from the 64-bit vector of [4 x i16] to the destination, +/// and inserts the lower 16-bits of an integer operand at the 16-bit offset +/// specified by the immediate operand \a n. +/// +/// \headerfile +/// +/// \code +/// __m64 _mm_insert_pi16(__m64 a, int d, int n); +/// \endcode +/// +/// This intrinsic corresponds to the PINSRW instruction. +/// +/// \param a +/// A 64-bit vector of [4 x i16]. +/// \param d +/// An integer. The lower 16-bit value from this operand is written to the +/// destination at the offset specified by operand \a n. +/// \param n +/// An immediate integer operant that determines which the bits to be used +/// in the destination. \n +/// 0: Bits [15:0] are copied to the destination. \n +/// 1: Bits [31:16] are copied to the destination. \n +/// 2: Bits [47:32] are copied to the destination. \n +/// 3: Bits [63:48] are copied to the destination. \n +/// The remaining bits in the destination are copied from the corresponding +/// bits in operand \a a. +/// \returns A 64-bit integer vector containing the copied packed data from the +/// operands. +#define _mm_insert_pi16(a, d, n) \ + ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)) + +/// Compares each of the corresponding packed 16-bit integer values of +/// the 64-bit integer vectors, and writes the greater value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMAXSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the comparison results. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_max_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); +} + +/// Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 64-bit integer vectors, and writes the greater value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMAXUB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the comparison results. 
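+///
+/// A minimal usage sketch (arbitrary byte values; _mm_set_pi8 and _mm_empty
+/// come from mmintrin.h):
+/// \code
+/// __m64 a = _mm_set_pi8(0, 10, 20, 30, 40, 50, 60, 70);
+/// __m64 b = _mm_set_pi8(5,  5, 25, 25, 45, 45, 65, 65);
+/// __m64 m = _mm_max_pu8(a, b); // per-byte unsigned maxima, in argument order:
+///                              // 5, 10, 25, 30, 45, 50, 65, 70
+/// _mm_empty();                 // clear MMX state before further FP code
+/// \endcode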
+static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_max_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); +} + +/// Compares each of the corresponding packed 16-bit integer values of +/// the 64-bit integer vectors, and writes the lesser value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMINSW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the comparison results. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_min_pi16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); +} + +/// Compares each of the corresponding packed 8-bit unsigned integer +/// values of the 64-bit integer vectors, and writes the lesser value to the +/// corresponding bits in the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMINUB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the comparison results. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_min_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); +} + +/// Takes the most significant bit from each 8-bit element in a 64-bit +/// integer vector to create an 8-bit mask value. Zero-extends the value to +/// 32-bit integer and writes it to the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMOVMSKB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing the values with bits to be extracted. +/// \returns The most significant bit from each 8-bit element in \a __a, +/// written to bits [7:0]. +static __inline__ int __DEFAULT_FN_ATTRS_MMX +_mm_movemask_pi8(__m64 __a) +{ + return __builtin_ia32_pmovmskb((__v8qi)__a); +} + +/// Multiplies packed 16-bit unsigned integer values and writes the +/// high-order 16 bits of each 32-bit product to the corresponding bits in +/// the destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PMULHUW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the products of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_mulhi_pu16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); +} + +/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the +/// destination, as specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m64 _mm_shuffle_pi16(__m64 a, const int n); +/// \endcode +/// +/// This intrinsic corresponds to the PSHUFW instruction. +/// +/// \param a +/// A 64-bit integer vector containing the values to be shuffled. +/// \param n +/// An immediate value containing an 8-bit value specifying which elements to +/// copy from \a a. The destinations within the 64-bit destination are +/// assigned values as follows: \n +/// Bits [1:0] are used to assign values to bits [15:0] in the +/// destination. \n +/// Bits [3:2] are used to assign values to bits [31:16] in the +/// destination. 
\n +/// Bits [5:4] are used to assign values to bits [47:32] in the +/// destination. \n +/// Bits [7:6] are used to assign values to bits [63:48] in the +/// destination. \n +/// Bit value assignments: \n +/// 00: assigned from bits [15:0] of \a a. \n +/// 01: assigned from bits [31:16] of \a a. \n +/// 10: assigned from bits [47:32] of \a a. \n +/// 11: assigned from bits [63:48] of \a a. +/// \returns A 64-bit integer vector containing the shuffled values. +#define _mm_shuffle_pi16(a, n) \ + ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) + +/// Conditionally copies the values from each 8-bit element in the first +/// 64-bit integer vector operand to the specified memory location, as +/// specified by the most significant bit in the corresponding element in the +/// second 64-bit integer vector operand. +/// +/// To minimize caching, the data is flagged as non-temporal +/// (unlikely to be used again soon). +/// +/// \headerfile +/// +/// This intrinsic corresponds to the MASKMOVQ instruction. +/// +/// \param __d +/// A 64-bit integer vector containing the values with elements to be copied. +/// \param __n +/// A 64-bit integer vector operand. The most significant bit from each 8-bit +/// element determines whether the corresponding element in operand \a __d +/// is copied. If the most significant bit of a given element is 1, the +/// corresponding element in operand \a __d is copied. +/// \param __p +/// A pointer to a 64-bit memory location that will receive the conditionally +/// copied integer values. The address of the memory location does not have +/// to be aligned. +static __inline__ void __DEFAULT_FN_ATTRS_MMX +_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) +{ + __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); +} + +/// Computes the rounded averages of the packed unsigned 8-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PAVGB instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the averages of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_avg_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); +} + +/// Computes the rounded averages of the packed unsigned 16-bit integer +/// values and writes the averages to the corresponding bits in the +/// destination. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PAVGW instruction. +/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector containing the averages of both operands. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_avg_pu16(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); +} + +/// Subtracts the corresponding 8-bit unsigned integer values of the two +/// 64-bit vector operands and computes the absolute value for each of the +/// difference. Then sum of the 8 absolute differences is written to the +/// bits [15:0] of the destination; the remaining bits [63:16] are cleared. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the PSADBW instruction. 
+/// +/// \param __a +/// A 64-bit integer vector containing one of the source operands. +/// \param __b +/// A 64-bit integer vector containing one of the source operands. +/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the +/// sets of absolute differences between both operands. The upper bits are +/// cleared. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_sad_pu8(__m64 __a, __m64 __b) +{ + return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); +} + +#if defined(__cplusplus) +extern "C" { +#endif + +/// Returns the contents of the MXCSR register as a 32-bit unsigned +/// integer value. +/// +/// There are several groups of macros associated with this +/// intrinsic, including: +///
+/// - For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///   _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///   _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///   _MM_GET_EXCEPTION_STATE().
+/// - For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///   _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///   There is a convenience wrapper _MM_GET_EXCEPTION_MASK().
+/// - For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///   _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///   _MM_GET_ROUNDING_MODE().
+/// - For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///   There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE().
+/// - For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///   _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///   _MM_GET_DENORMALS_ZERO_MODE().
+/// +/// For example, the following expression checks if an overflow exception has +/// occurred: +/// \code +/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) +/// \endcode +/// +/// The following expression gets the current rounding mode: +/// \code +/// _MM_GET_ROUNDING_MODE() +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VSTMXCSR / STMXCSR instruction. +/// +/// \returns A 32-bit unsigned integer containing the contents of the MXCSR +/// register. +unsigned int _mm_getcsr(void); + +/// Sets the MXCSR register with the 32-bit unsigned integer value. +/// +/// There are several groups of macros associated with this intrinsic, +/// including: +///
+/// - For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO,
+///   _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW,
+///   _MM_EXCEPT_INEXACT. There is a convenience wrapper
+///   _MM_SET_EXCEPTION_STATE(x) where x is one of these macros.
+/// - For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW,
+///   _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT.
+///   There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one
+///   of these macros.
+/// - For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN,
+///   _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper
+///   _MM_SET_ROUNDING_MODE(x) where x is one of these macros.
+/// - For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF.
+///   There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is
+///   one of these macros.
+/// - For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON,
+///   _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper
+///   _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros.
+/// +/// For example, the following expression causes subsequent floating-point +/// operations to round up: +/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) +/// +/// The following example sets the DAZ and FTZ flags: +/// \code +/// void setFlags() { +/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); +/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); +/// } +/// \endcode +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VLDMXCSR / LDMXCSR instruction. +/// +/// \param __i +/// A 32-bit unsigned integer value to be written to the MXCSR register. +void _mm_setcsr(unsigned int __i); + +#if defined(__cplusplus) +} // extern "C" +#endif + +/// Selects 4 float values from the 128-bit operands of [4 x float], as +/// specified by the immediate value operand. +/// +/// \headerfile +/// +/// \code +/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); +/// \endcode +/// +/// This intrinsic corresponds to the VSHUFPS / SHUFPS instruction. +/// +/// \param a +/// A 128-bit vector of [4 x float]. +/// \param b +/// A 128-bit vector of [4 x float]. +/// \param mask +/// An immediate value containing an 8-bit value specifying which elements to +/// copy from \a a and \a b. \n +/// Bits [3:0] specify the values copied from operand \a a. \n +/// Bits [7:4] specify the values copied from operand \a b. \n +/// The destinations within the 128-bit destination are assigned values as +/// follows: \n +/// Bits [1:0] are used to assign values to bits [31:0] in the +/// destination. \n +/// Bits [3:2] are used to assign values to bits [63:32] in the +/// destination. \n +/// Bits [5:4] are used to assign values to bits [95:64] in the +/// destination. \n +/// Bits [7:6] are used to assign values to bits [127:96] in the +/// destination. \n +/// Bit value assignments: \n +/// 00: Bits [31:0] copied from the specified operand. \n +/// 01: Bits [63:32] copied from the specified operand. \n +/// 10: Bits [95:64] copied from the specified operand. \n +/// 11: Bits [127:96] copied from the specified operand. +/// \returns A 128-bit vector of [4 x float] containing the shuffled values. +#define _mm_shuffle_ps(a, b, mask) \ + ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ + (int)(mask))) + +/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of +/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKHPS / UNPCKHPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. \n +/// Bits [95:64] are written to bits [31:0] of the destination. \n +/// Bits [127:96] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x float]. +/// Bits [95:64] are written to bits [63:32] of the destination. \n +/// Bits [127:96] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the interleaved values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_unpackhi_ps(__m128 __a, __m128 __b) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); +} + +/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of +/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPS / UNPCKLPS instruction. +/// +/// \param __a +/// A 128-bit vector of [4 x float]. \n +/// Bits [31:0] are written to bits [31:0] of the destination. 
\n +/// Bits [63:32] are written to bits [95:64] of the destination. +/// \param __b +/// A 128-bit vector of [4 x float]. \n +/// Bits [31:0] are written to bits [63:32] of the destination. \n +/// Bits [63:32] are written to bits [127:96] of the destination. +/// \returns A 128-bit vector of [4 x float] containing the interleaved values. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_unpacklo_ps(__m128 __a, __m128 __b) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); +} + +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// 32 bits are set to the lower 32 bits of the second parameter. The upper +/// 96 bits are set to the upper 96 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VBLENDPS / BLENDPS / MOVSS +/// instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are +/// written to the upper 96 bits of the result. +/// \param __b +/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are +/// written to the lower 32 bits of the result. +/// \returns A 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_move_ss(__m128 __a, __m128 __b) +{ + __a[0] = __b[0]; + return __a; +} + +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// 64 bits are set to the upper 64 bits of the second parameter. The upper +/// 64 bits are set to the upper 64 bits of the first parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKHPD / UNPCKHPD instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are +/// written to the upper 64 bits of the result. +/// \param __b +/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are +/// written to the lower 64 bits of the result. +/// \returns A 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_movehl_ps(__m128 __a, __m128 __b) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); +} + +/// Constructs a 128-bit floating-point vector of [4 x float]. The lower +/// 64 bits are set to the lower 64 bits of the first parameter. The upper +/// 64 bits are set to the lower 64 bits of the second parameter. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are +/// written to the lower 64 bits of the result. +/// \param __b +/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are +/// written to the upper 64 bits of the result. +/// \returns A 128-bit floating-point vector of [4 x float]. +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_movelh_ps(__m128 __a, __m128 __b) +{ + return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); +} + +/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x +/// float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [4 x i16]. The elements of the destination are copied +/// from the corresponding elements in this operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and converted +/// values from the operand. 
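+///
+/// A minimal usage sketch (arbitrary values; _mm_set_pi16 and _mm_empty come
+/// from mmintrin.h):
+/// \code
+/// __m64  w = _mm_set_pi16(4, 3, -2, 1); // lanes, low to high: 1, -2, 3, 4
+/// __m128 f = _mm_cvtpi16_ps(w);         // f = { 1.0, -2.0, 3.0, 4.0 }
+/// _mm_empty();                          // clear MMX state afterwards
+/// \endcode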
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpi16_ps(__m64 __a) +{ + __m64 __b, __c; + __m128 __r; + + __b = _mm_setzero_si64(); + __b = _mm_cmpgt_pi16(__b, __a); + __c = _mm_unpackhi_pi16(__a, __b); + __r = _mm_setzero_ps(); + __r = _mm_cvtpi32_ps(__r, __c); + __r = _mm_movelh_ps(__r, __r); + __c = _mm_unpacklo_pi16(__a, __b); + __r = _mm_cvtpi32_ps(__r, __c); + + return __r; +} + +/// Converts a 64-bit vector of 16-bit unsigned integer values into a +/// 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of 16-bit unsigned integer values. The elements of the +/// destination are copied from the corresponding elements in this operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and converted +/// values from the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpu16_ps(__m64 __a) +{ + __m64 __b, __c; + __m128 __r; + + __b = _mm_setzero_si64(); + __c = _mm_unpackhi_pi16(__a, __b); + __r = _mm_setzero_ps(); + __r = _mm_cvtpi32_ps(__r, __c); + __r = _mm_movelh_ps(__r, __r); + __c = _mm_unpacklo_pi16(__a, __b); + __r = _mm_cvtpi32_ps(__r, __c); + + return __r; +} + +/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] +/// into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [8 x i8]. The elements of the destination are copied +/// from the corresponding lower 4 elements in this operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and converted +/// values from the operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpi8_ps(__m64 __a) +{ + __m64 __b; + + __b = _mm_setzero_si64(); + __b = _mm_cmpgt_pi8(__b, __a); + __b = _mm_unpacklo_pi8(__a, __b); + + return _mm_cvtpi16_ps(__b); +} + +/// Converts the lower four unsigned 8-bit integer values from a 64-bit +/// vector of [8 x u8] into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of unsigned 8-bit integer values. The elements of the +/// destination are copied from the corresponding lower 4 elements in this +/// operand. +/// \returns A 128-bit vector of [4 x float] containing the copied and converted +/// values from the source operand. +static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpu8_ps(__m64 __a) +{ + __m64 __b; + + __b = _mm_setzero_si64(); + __b = _mm_unpacklo_pi8(__a, __b); + + return _mm_cvtpi16_ps(__b); +} + +/// Converts the two 32-bit signed integer values from each 64-bit vector +/// operand of [2 x i32] into a 128-bit vector of [4 x float]. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. +/// +/// \param __a +/// A 64-bit vector of [2 x i32]. The lower elements of the destination are +/// copied from the elements in this operand. +/// \param __b +/// A 64-bit vector of [2 x i32]. The upper elements of the destination are +/// copied from the elements in this operand. +/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the +/// copied and converted values from the first operand. The upper 64 bits +/// contain the copied and converted values from the second operand. 
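+///
+/// A minimal usage sketch (arbitrary values; _mm_set_pi32 and _mm_empty come
+/// from mmintrin.h):
+/// \code
+/// __m64  lo = _mm_set_pi32(2, 1);       // lanes: 1, 2
+/// __m64  hi = _mm_set_pi32(4, 3);       // lanes: 3, 4
+/// __m128 f  = _mm_cvtpi32x2_ps(lo, hi); // f = { 1.0, 2.0, 3.0, 4.0 }
+/// _mm_empty();                          // clear MMX state afterwards
+/// \endcode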
+static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX +_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) +{ + __m128 __c; + + __c = _mm_setzero_ps(); + __c = _mm_cvtpi32_ps(__c, __b); + __c = _mm_movelh_ps(__c, __c); + + return _mm_cvtpi32_ps(__c, __a); +} + +/// Converts each single-precision floating-point element of a 128-bit +/// floating-point vector of [4 x float] into a 16-bit signed integer, and +/// packs the results into a 64-bit integer vector of [4 x i16]. +/// +/// If the floating-point element is NaN or infinity, or if the +/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, +/// it is converted to 0x8000. Otherwise if the floating-point element is +/// greater than 0x7FFF, it is converted to 0x7FFF. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPS2PI + COMPOSITE instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 64-bit integer vector of [4 x i16] containing the converted +/// values. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvtps_pi16(__m128 __a) +{ + __m64 __b, __c; + + __b = _mm_cvtps_pi32(__a); + __a = _mm_movehl_ps(__a, __a); + __c = _mm_cvtps_pi32(__a); + + return _mm_packs_pi32(__b, __c); +} + +/// Converts each single-precision floating-point element of a 128-bit +/// floating-point vector of [4 x float] into an 8-bit signed integer, and +/// packs the results into the lower 32 bits of a 64-bit integer vector of +/// [8 x i8]. The upper 32 bits of the vector are set to 0. +/// +/// If the floating-point element is NaN or infinity, or if the +/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it +/// is converted to 0x80. Otherwise if the floating-point element is greater +/// than 0x7F, it is converted to 0x7F. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the CVTPS2PI + COMPOSITE instruction. +/// +/// \param __a +/// 128-bit floating-point vector of [4 x float]. +/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the +/// converted values and the uppper 32 bits are set to zero. +static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX +_mm_cvtps_pi8(__m128 __a) +{ + __m64 __b, __c; + + __b = _mm_cvtps_pi16(__a); + __c = _mm_setzero_si64(); + + return _mm_packs_pi16(__b, __c); +} + +/// Extracts the sign bits from each single-precision floating-point +/// element of a 128-bit floating-point vector of [4 x float] and returns the +/// sign bits in bits [0:3] of the result. Bits [31:4] of the result are set +/// to zero. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the VMOVMSKPS / MOVMSKPS instruction. +/// +/// \param __a +/// A 128-bit floating-point vector of [4 x float]. +/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each +/// single-precision floating-point element of the parameter. Bits [31:4] are +/// set to zero. 
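A common idiom built on the sign-bit extraction described above is testing whether any lane of a comparison matched; a minimal sketch with example names:

#include <xmmintrin.h>

/* Nonzero if any element of a is less than the matching element of b. */
static int example_any_less(__m128 a, __m128 b)
{
    __m128 lt = _mm_cmplt_ps(a, b);      /* all-ones lanes where a[i] < b[i] */
    return _mm_movemask_ps(lt) != 0;     /* four sign bits collected into bits [3:0] */
}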
+static __inline__ int __DEFAULT_FN_ATTRS +_mm_movemask_ps(__m128 __a) +{ + return __builtin_ia32_movmskps((__v4sf)__a); +} + + +#define _MM_ALIGN16 __attribute__((aligned(16))) + +#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) + +#define _MM_EXCEPT_INVALID (0x0001U) +#define _MM_EXCEPT_DENORM (0x0002U) +#define _MM_EXCEPT_DIV_ZERO (0x0004U) +#define _MM_EXCEPT_OVERFLOW (0x0008U) +#define _MM_EXCEPT_UNDERFLOW (0x0010U) +#define _MM_EXCEPT_INEXACT (0x0020U) +#define _MM_EXCEPT_MASK (0x003fU) + +#define _MM_MASK_INVALID (0x0080U) +#define _MM_MASK_DENORM (0x0100U) +#define _MM_MASK_DIV_ZERO (0x0200U) +#define _MM_MASK_OVERFLOW (0x0400U) +#define _MM_MASK_UNDERFLOW (0x0800U) +#define _MM_MASK_INEXACT (0x1000U) +#define _MM_MASK_MASK (0x1f80U) + +#define _MM_ROUND_NEAREST (0x0000U) +#define _MM_ROUND_DOWN (0x2000U) +#define _MM_ROUND_UP (0x4000U) +#define _MM_ROUND_TOWARD_ZERO (0x6000U) +#define _MM_ROUND_MASK (0x6000U) + +#define _MM_FLUSH_ZERO_MASK (0x8000U) +#define _MM_FLUSH_ZERO_ON (0x8000U) +#define _MM_FLUSH_ZERO_OFF (0x0000U) + +#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) +#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) +#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) +#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) + +#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) +#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) +#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) +#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) + +#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ +do { \ + __m128 tmp3, tmp2, tmp1, tmp0; \ + tmp0 = _mm_unpacklo_ps((row0), (row1)); \ + tmp2 = _mm_unpacklo_ps((row2), (row3)); \ + tmp1 = _mm_unpackhi_ps((row0), (row1)); \ + tmp3 = _mm_unpackhi_ps((row2), (row3)); \ + (row0) = _mm_movelh_ps(tmp0, tmp2); \ + (row1) = _mm_movehl_ps(tmp2, tmp0); \ + (row2) = _mm_movelh_ps(tmp1, tmp3); \ + (row3) = _mm_movehl_ps(tmp3, tmp1); \ +} while (0) + +/* Aliases for compatibility. */ +#define _m_pextrw _mm_extract_pi16 +#define _m_pinsrw _mm_insert_pi16 +#define _m_pmaxsw _mm_max_pi16 +#define _m_pmaxub _mm_max_pu8 +#define _m_pminsw _mm_min_pi16 +#define _m_pminub _mm_min_pu8 +#define _m_pmovmskb _mm_movemask_pi8 +#define _m_pmulhuw _mm_mulhi_pu16 +#define _m_pshufw _mm_shuffle_pi16 +#define _m_maskmovq _mm_maskmove_si64 +#define _m_pavgb _mm_avg_pu8 +#define _m_pavgw _mm_avg_pu16 +#define _m_psadbw _mm_sad_pu8 +#define _m_ _mm_ +#define _m_ _mm_ + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS_MMX + +/* Ugly hack for backwards-compatibility (compatible with gcc) */ +#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) +#include +#endif + +#endif /* __XMMINTRIN_H */ diff --git a/include-llvm/xopintrin.h b/include-llvm/xopintrin.h new file mode 100644 index 0000000..976cdf4 --- /dev/null +++ b/include-llvm/xopintrin.h @@ -0,0 +1,770 @@ +/*===---- xopintrin.h - XOP intrinsics -------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __X86INTRIN_H +#error "Never use directly; include instead." 
+#endif + +#ifndef __XOPINTRIN_H +#define __XOPINTRIN_H + +#include + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128))) +#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddw_epi8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddd_epi8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epi8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddd_epi16(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epi16(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epi32(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); +} + 
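For illustration, a minimal sketch of the XOP multiply-accumulate intrinsics defined above; the function name is an example, and XOP is only available on AMD hardware that implements it (compile with e.g. -mxop):

#include <x86intrin.h>

static __m128i example_fma16(__m128i a, __m128i b, __m128i c)
{
    /* Per 16-bit lane: a*b + c, with signed saturation of the final add. */
    return _mm_maccs_epi16(a, b, c);
}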
+static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddw_epu8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddd_epu8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epu8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddd_epu16(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epu16(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_haddq_epu32(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsubw_epi8(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsubd_epi16(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_hsubq_epi32(__m128i __A) +{ + return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C)); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) +{ + return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C)); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) +{ + return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_rot_epi8(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_rot_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_rot_epi32(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_rot_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); +} + +#define _mm_roti_epi8(A, N) \ + ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))) + +#define _mm_roti_epi16(A, N) \ + ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))) + +#define _mm_roti_epi32(A, N) \ + ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))) + +#define _mm_roti_epi64(A, N) \ + ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))) + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_shl_epi8(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_shl_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_shl_epi32(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_shl_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); +} + +static __inline__ __m128i 
__DEFAULT_FN_ATTRS +_mm_sha_epi8(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha_epi16(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha_epi32(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_sha_epi64(__m128i __A, __m128i __B) +{ + return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); +} + +#define _mm_com_epu8(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (N))) + +#define _mm_com_epu16(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) + +#define _mm_com_epu32(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) + +#define _mm_com_epu64(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) + +#define _mm_com_epi8(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ + (__v16qi)(__m128i)(B), (N))) + +#define _mm_com_epi16(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ + (__v8hi)(__m128i)(B), (N))) + +#define _mm_com_epi32(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ + (__v4si)(__m128i)(B), (N))) + +#define _mm_com_epi64(A, B, N) \ + ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ + (__v2di)(__m128i)(B), (N))) + +#define _MM_PCOMCTRL_LT 0 +#define _MM_PCOMCTRL_LE 1 +#define _MM_PCOMCTRL_GT 2 +#define _MM_PCOMCTRL_GE 3 +#define _MM_PCOMCTRL_EQ 4 +#define _MM_PCOMCTRL_NEQ 5 +#define _MM_PCOMCTRL_FALSE 6 +#define _MM_PCOMCTRL_TRUE 7 + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epu8(__m128i __A, __m128i __B) +{ + return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epu16(__m128i __A, __m128i 
__B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epu16(__m128i __A, __m128i __B) +{ + return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epu32(__m128i __A, __m128i __B) +{ + return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epu64(__m128i __A, __m128i __B) +{ + return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS 
+_mm_comge_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epi8(__m128i __A, __m128i __B) +{ + return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epi16(__m128i __A, __m128i __B) +{ + return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comge_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epi32(__m128i __A, __m128i __B) +{ + return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comlt_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comle_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comgt_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT); +} + +static __inline__ 
__m128i __DEFAULT_FN_ATTRS +_mm_comge_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comeq_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comneq_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comfalse_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS +_mm_comtrue_epi64(__m128i __A, __m128i __B) +{ + return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE); +} + +#define _mm_permute2_pd(X, Y, C, I) \ + ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ + (__v2df)(__m128d)(Y), \ + (__v2di)(__m128i)(C), (I))) + +#define _mm256_permute2_pd(X, Y, C, I) \ + ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ + (__v4df)(__m256d)(Y), \ + (__v4di)(__m256i)(C), (I))) + +#define _mm_permute2_ps(X, Y, C, I) \ + ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ + (__v4si)(__m128i)(C), (I))) + +#define _mm256_permute2_ps(X, Y, C, I) \ + ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ + (__v8sf)(__m256)(Y), \ + (__v8si)(__m256i)(C), (I))) + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_frcz_ss(__m128 __A) +{ + return (__m128)__builtin_ia32_vfrczss((__v4sf)__A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_frcz_sd(__m128d __A) +{ + return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A); +} + +static __inline__ __m128 __DEFAULT_FN_ATTRS +_mm_frcz_ps(__m128 __A) +{ + return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); +} + +static __inline__ __m128d __DEFAULT_FN_ATTRS +_mm_frcz_pd(__m128d __A) +{ + return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); +} + +static __inline__ __m256 __DEFAULT_FN_ATTRS256 +_mm256_frcz_ps(__m256 __A) +{ + return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); +} + +static __inline__ __m256d __DEFAULT_FN_ATTRS256 +_mm256_frcz_pd(__m256d __A) +{ + return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); +} + +#undef __DEFAULT_FN_ATTRS +#undef __DEFAULT_FN_ATTRS256 + +#endif /* __XOPINTRIN_H */ diff --git a/include-llvm/xsavecintrin.h b/include-llvm/xsavecintrin.h new file mode 100644 index 0000000..5524947 --- /dev/null +++ b/include-llvm/xsavecintrin.h @@ -0,0 +1,34 @@ +/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __XSAVECINTRIN_H +#define __XSAVECINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_xsavec(void *__p, unsigned long long __m) { + __builtin_ia32_xsavec(__p, __m); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS +_xsavec64(void *__p, unsigned long long __m) { + __builtin_ia32_xsavec64(__p, __m); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/xsaveintrin.h b/include-llvm/xsaveintrin.h new file mode 100644 index 0000000..9429db6 --- /dev/null +++ b/include-llvm/xsaveintrin.h @@ -0,0 +1,63 @@ +/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __XSAVEINTRIN_H +#define __XSAVEINTRIN_H + +#ifdef _MSC_VER +#define _XCR_XFEATURE_ENABLED_MASK 0 +#endif + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_xsave(void *__p, unsigned long long __m) { + __builtin_ia32_xsave(__p, __m); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_xrstor(void *__p, unsigned long long __m) { + __builtin_ia32_xrstor(__p, __m); +} + +#ifndef _MSC_VER +#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A)) +#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B)) +#else +#ifdef __cplusplus +extern "C" { +#endif +unsigned __int64 __cdecl _xgetbv(unsigned int); +void __cdecl _xsetbv(unsigned int, unsigned __int64); +#ifdef __cplusplus +} +#endif +#endif /* _MSC_VER */ + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS +_xsave64(void *__p, unsigned long long __m) { + __builtin_ia32_xsave64(__p, __m); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_xrstor64(void *__p, unsigned long long __m) { + __builtin_ia32_xrstor64(__p, __m); +} + +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/xsaveoptintrin.h b/include-llvm/xsaveoptintrin.h new file mode 100644 index 0000000..89a4c44 --- /dev/null +++ b/include-llvm/xsaveoptintrin.h @@ -0,0 +1,34 @@ +/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __XSAVEOPTINTRIN_H +#define __XSAVEOPTINTRIN_H + +/* Define the default attributes for the functions in this file. 
*/ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_xsaveopt(void *__p, unsigned long long __m) { + __builtin_ia32_xsaveopt(__p, __m); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS +_xsaveopt64(void *__p, unsigned long long __m) { + __builtin_ia32_xsaveopt64(__p, __m); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/xsavesintrin.h b/include-llvm/xsavesintrin.h new file mode 100644 index 0000000..3f99219 --- /dev/null +++ b/include-llvm/xsavesintrin.h @@ -0,0 +1,44 @@ +/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __XSAVESINTRIN_H +#define __XSAVESINTRIN_H + +/* Define the default attributes for the functions in this file. */ +#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves"))) + +static __inline__ void __DEFAULT_FN_ATTRS +_xsaves(void *__p, unsigned long long __m) { + __builtin_ia32_xsaves(__p, __m); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_xrstors(void *__p, unsigned long long __m) { + __builtin_ia32_xrstors(__p, __m); +} + +#ifdef __x86_64__ +static __inline__ void __DEFAULT_FN_ATTRS +_xrstors64(void *__p, unsigned long long __m) { + __builtin_ia32_xrstors64(__p, __m); +} + +static __inline__ void __DEFAULT_FN_ATTRS +_xsaves64(void *__p, unsigned long long __m) { + __builtin_ia32_xsaves64(__p, __m); +} +#endif + +#undef __DEFAULT_FN_ATTRS + +#endif diff --git a/include-llvm/xtestintrin.h b/include-llvm/xtestintrin.h new file mode 100644 index 0000000..7d19e37 --- /dev/null +++ b/include-llvm/xtestintrin.h @@ -0,0 +1,27 @@ +/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __IMMINTRIN_H +#error "Never use directly; include instead." +#endif + +#ifndef __XTESTINTRIN_H +#define __XTESTINTRIN_H + +/* xtest returns non-zero if the instruction is executed within an RTM or active + * HLE region. */ +/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is + * supported. */ +static __inline__ int + __attribute__((__always_inline__, __nodebug__, __target__("rtm"))) + _xtest(void) { + return __builtin_ia32_xtest(); +} + +#endif diff --git a/include/__wmmintrin_aes.h b/include/__wmmintrin_aes.h deleted file mode 100644 index 3010b38..0000000 --- a/include/__wmmintrin_aes.h +++ /dev/null @@ -1,140 +0,0 @@ -/*===---- __wmmintrin_aes.h - AES intrinsics -------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __WMMINTRIN_H -#error "Never use <__wmmintrin_aes.h> directly; include instead." -#endif - -#ifndef __WMMINTRIN_AES_H -#define __WMMINTRIN_AES_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("aes"), __min_vector_width__(128))) - -/// Performs a single round of AES encryption using the Equivalent -/// Inverse Cipher, transforming the state value from the first source -/// operand using a 128-bit round key value contained in the second source -/// operand, and writes the result to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VAESENC instruction. -/// -/// \param __V -/// A 128-bit integer vector containing the state value. -/// \param __R -/// A 128-bit integer vector containing the round key value. -/// \returns A 128-bit integer vector containing the encrypted value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_aesenc_si128(__m128i __V, __m128i __R) -{ - return (__m128i)__builtin_ia32_aesenc128((__v2di)__V, (__v2di)__R); -} - -/// Performs the final round of AES encryption using the Equivalent -/// Inverse Cipher, transforming the state value from the first source -/// operand using a 128-bit round key value contained in the second source -/// operand, and writes the result to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VAESENCLAST instruction. -/// -/// \param __V -/// A 128-bit integer vector containing the state value. -/// \param __R -/// A 128-bit integer vector containing the round key value. -/// \returns A 128-bit integer vector containing the encrypted value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_aesenclast_si128(__m128i __V, __m128i __R) -{ - return (__m128i)__builtin_ia32_aesenclast128((__v2di)__V, (__v2di)__R); -} - -/// Performs a single round of AES decryption using the Equivalent -/// Inverse Cipher, transforming the state value from the first source -/// operand using a 128-bit round key value contained in the second source -/// operand, and writes the result to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VAESDEC instruction. -/// -/// \param __V -/// A 128-bit integer vector containing the state value. -/// \param __R -/// A 128-bit integer vector containing the round key value. -/// \returns A 128-bit integer vector containing the decrypted value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_aesdec_si128(__m128i __V, __m128i __R) -{ - return (__m128i)__builtin_ia32_aesdec128((__v2di)__V, (__v2di)__R); -} - -/// Performs the final round of AES decryption using the Equivalent -/// Inverse Cipher, transforming the state value from the first source -/// operand using a 128-bit round key value contained in the second source -/// operand, and writes the result to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VAESDECLAST instruction. -/// -/// \param __V -/// A 128-bit integer vector containing the state value. -/// \param __R -/// A 128-bit integer vector containing the round key value. -/// \returns A 128-bit integer vector containing the decrypted value. 
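A minimal sketch of how the AES round intrinsics documented here are typically chained for AES-128, assuming the 11 round keys have already been expanded (key expansion is not shown) and using example names:

#include <wmmintrin.h>   /* AES-NI; compile with e.g. -maes */

static __m128i example_aes128_encrypt(__m128i block, const __m128i rk[11])
{
    block = _mm_xor_si128(block, rk[0]);        /* initial whitening */
    for (int i = 1; i < 10; ++i)
        block = _mm_aesenc_si128(block, rk[i]); /* rounds 1..9 */
    return _mm_aesenclast_si128(block, rk[10]); /* final round */
}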
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_aesdeclast_si128(__m128i __V, __m128i __R) -{ - return (__m128i)__builtin_ia32_aesdeclast128((__v2di)__V, (__v2di)__R); -} - -/// Applies the AES InvMixColumns() transformation to an expanded key -/// contained in the source operand, and writes the result to the -/// destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VAESIMC instruction. -/// -/// \param __V -/// A 128-bit integer vector containing the expanded key. -/// \returns A 128-bit integer vector containing the transformed value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_aesimc_si128(__m128i __V) -{ - return (__m128i)__builtin_ia32_aesimc128((__v2di)__V); -} - -/// Generates a round key for AES encryption, operating on 128-bit data -/// specified in the first source operand and using an 8-bit round constant -/// specified by the second source operand, and writes the result to the -/// destination. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_aeskeygenassist_si128(__m128i C, const int R); -/// \endcode -/// -/// This intrinsic corresponds to the AESKEYGENASSIST instruction. -/// -/// \param C -/// A 128-bit integer vector that is used to generate the AES encryption key. -/// \param R -/// An 8-bit round constant used to generate the AES encryption key. -/// \returns A 128-bit round key for AES encryption. -#define _mm_aeskeygenassist_si128(C, R) \ - ((__m128i)__builtin_ia32_aeskeygenassist128((__v2di)(__m128i)(C), (int)(R))) - -#undef __DEFAULT_FN_ATTRS - -#endif /* __WMMINTRIN_AES_H */ diff --git a/include/__wmmintrin_pclmul.h b/include/__wmmintrin_pclmul.h deleted file mode 100644 index fef4b93..0000000 --- a/include/__wmmintrin_pclmul.h +++ /dev/null @@ -1,48 +0,0 @@ -/*===---- __wmmintrin_pclmul.h - PCMUL intrinsics ---------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __WMMINTRIN_H -#error "Never use <__wmmintrin_pclmul.h> directly; include instead." -#endif - -#ifndef __WMMINTRIN_PCLMUL_H -#define __WMMINTRIN_PCLMUL_H - -/// Multiplies two 64-bit integer values, which are selected from source -/// operands using the immediate-value operand. The multiplication is a -/// carry-less multiplication, and the 128-bit integer product is stored in -/// the destination. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_clmulepi64_si128(__m128i __X, __m128i __Y, const int __I); -/// \endcode -/// -/// This intrinsic corresponds to the VPCLMULQDQ instruction. -/// -/// \param __X -/// A 128-bit vector of [2 x i64] containing one of the source operands. -/// \param __Y -/// A 128-bit vector of [2 x i64] containing one of the source operands. -/// \param __I -/// An immediate value specifying which 64-bit values to select from the -/// operands. Bit 0 is used to select a value from operand \a __X, and bit -/// 4 is used to select a value from operand \a __Y: \n -/// Bit[0]=0 indicates that bits[63:0] of operand \a __X are used. \n -/// Bit[0]=1 indicates that bits[127:64] of operand \a __X are used. \n -/// Bit[4]=0 indicates that bits[63:0] of operand \a __Y are used. \n -/// Bit[4]=1 indicates that bits[127:64] of operand \a __Y are used. 
-/// \returns The 128-bit integer vector containing the result of the carry-less -/// multiplication of the selected 64-bit values. -#define _mm_clmulepi64_si128(X, Y, I) \ - ((__m128i)__builtin_ia32_pclmulqdq128((__v2di)(__m128i)(X), \ - (__v2di)(__m128i)(Y), (char)(I))) - -#endif /* __WMMINTRIN_PCLMUL_H */ diff --git a/include/adxintrin.h b/include/adxintrin.h deleted file mode 100644 index 72b9ed0..0000000 --- a/include/adxintrin.h +++ /dev/null @@ -1,72 +0,0 @@ -/*===---- adxintrin.h - ADX intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __ADXINTRIN_H -#define __ADXINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) - -/* Intrinsics that are available only if __ADX__ defined */ -static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx"))) -_addcarryx_u32(unsigned char __cf, unsigned int __x, unsigned int __y, - unsigned int *__p) -{ - return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); -} - -#ifdef __x86_64__ -static __inline unsigned char __attribute__((__always_inline__, __nodebug__, __target__("adx"))) -_addcarryx_u64(unsigned char __cf, unsigned long long __x, - unsigned long long __y, unsigned long long *__p) -{ - return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); -} -#endif - -/* Intrinsics that are also available if __ADX__ undefined */ -static __inline unsigned char __DEFAULT_FN_ATTRS -_addcarry_u32(unsigned char __cf, unsigned int __x, unsigned int __y, - unsigned int *__p) -{ - return __builtin_ia32_addcarryx_u32(__cf, __x, __y, __p); -} - -#ifdef __x86_64__ -static __inline unsigned char __DEFAULT_FN_ATTRS -_addcarry_u64(unsigned char __cf, unsigned long long __x, - unsigned long long __y, unsigned long long *__p) -{ - return __builtin_ia32_addcarryx_u64(__cf, __x, __y, __p); -} -#endif - -static __inline unsigned char __DEFAULT_FN_ATTRS -_subborrow_u32(unsigned char __cf, unsigned int __x, unsigned int __y, - unsigned int *__p) -{ - return __builtin_ia32_subborrow_u32(__cf, __x, __y, __p); -} - -#ifdef __x86_64__ -static __inline unsigned char __DEFAULT_FN_ATTRS -_subborrow_u64(unsigned char __cf, unsigned long long __x, - unsigned long long __y, unsigned long long *__p) -{ - return __builtin_ia32_subborrow_u64(__cf, __x, __y, __p); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* __ADXINTRIN_H */ diff --git a/include/ammintrin.h b/include/ammintrin.h deleted file mode 100644 index 1af2096..0000000 --- a/include/ammintrin.h +++ /dev/null @@ -1,183 +0,0 @@ -/*===---- ammintrin.h - SSE4a intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __AMMINTRIN_H -#define __AMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4a"), __min_vector_width__(128))) - -/// Extracts the specified bits from the lower 64 bits of the 128-bit -/// integer vector operand at the index \a idx and of the length \a len. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_extracti_si64(__m128i x, const int len, const int idx); -/// \endcode -/// -/// This intrinsic corresponds to the EXTRQ instruction. -/// -/// \param x -/// The value from which bits are extracted. -/// \param len -/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0] -/// are zero, the length is interpreted as 64. -/// \param idx -/// Bits [5:0] specify the index of the least significant bit; the other -/// bits are ignored. If the sum of the index and length is greater than 64, -/// the result is undefined. If the length and index are both zero, bits -/// [63:0] of parameter \a x are extracted. If the length is zero but the -/// index is non-zero, the result is undefined. -/// \returns A 128-bit integer vector whose lower 64 bits contain the bits -/// extracted from the source operand. -#define _mm_extracti_si64(x, len, idx) \ - ((__m128i)__builtin_ia32_extrqi((__v2di)(__m128i)(x), \ - (char)(len), (char)(idx))) - -/// Extracts the specified bits from the lower 64 bits of the 128-bit -/// integer vector operand at the index and of the length specified by -/// \a __y. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the EXTRQ instruction. -/// -/// \param __x -/// The value from which bits are extracted. -/// \param __y -/// Specifies the index of the least significant bit at [13:8] and the -/// length at [5:0]; all other bits are ignored. If bits [5:0] are zero, the -/// length is interpreted as 64. If the sum of the index and length is -/// greater than 64, the result is undefined. If the length and index are -/// both zero, bits [63:0] of parameter \a __x are extracted. If the length -/// is zero but the index is non-zero, the result is undefined. -/// \returns A 128-bit vector whose lower 64 bits contain the bits extracted -/// from the source operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_extract_si64(__m128i __x, __m128i __y) -{ - return (__m128i)__builtin_ia32_extrq((__v2di)__x, (__v16qi)__y); -} - -/// Inserts bits of a specified length from the source integer vector -/// \a y into the lower 64 bits of the destination integer vector \a x at -/// the index \a idx and of the length \a len. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_inserti_si64(__m128i x, __m128i y, const int len, -/// const int idx); -/// \endcode -/// -/// This intrinsic corresponds to the INSERTQ instruction. -/// -/// \param x -/// The destination operand where bits will be inserted. The inserted bits -/// are defined by the length \a len and by the index \a idx specifying the -/// least significant bit. -/// \param y -/// The source operand containing the bits to be extracted. The extracted -/// bits are the least significant bits of operand \a y of length \a len. 
-/// \param len -/// Bits [5:0] specify the length; the other bits are ignored. If bits [5:0] -/// are zero, the length is interpreted as 64. -/// \param idx -/// Bits [5:0] specify the index of the least significant bit; the other -/// bits are ignored. If the sum of the index and length is greater than 64, -/// the result is undefined. If the length and index are both zero, bits -/// [63:0] of parameter \a y are inserted into parameter \a x. If the length -/// is zero but the index is non-zero, the result is undefined. -/// \returns A 128-bit integer vector containing the original lower 64-bits of -/// destination operand \a x with the specified bitfields replaced by the -/// lower bits of source operand \a y. The upper 64 bits of the return value -/// are undefined. -#define _mm_inserti_si64(x, y, len, idx) \ - ((__m128i)__builtin_ia32_insertqi((__v2di)(__m128i)(x), \ - (__v2di)(__m128i)(y), \ - (char)(len), (char)(idx))) - -/// Inserts bits of a specified length from the source integer vector -/// \a __y into the lower 64 bits of the destination integer vector \a __x -/// at the index and of the length specified by \a __y. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the INSERTQ instruction. -/// -/// \param __x -/// The destination operand where bits will be inserted. The inserted bits -/// are defined by the length and by the index of the least significant bit -/// specified by operand \a __y. -/// \param __y -/// The source operand containing the bits to be extracted. The extracted -/// bits are the least significant bits of operand \a __y with length -/// specified by bits [69:64]. These are inserted into the destination at the -/// index specified by bits [77:72]; all other bits are ignored. If bits -/// [69:64] are zero, the length is interpreted as 64. If the sum of the -/// index and length is greater than 64, the result is undefined. If the -/// length and index are both zero, bits [63:0] of parameter \a __y are -/// inserted into parameter \a __x. If the length is zero but the index is -/// non-zero, the result is undefined. -/// \returns A 128-bit integer vector containing the original lower 64-bits of -/// destination operand \a __x with the specified bitfields replaced by the -/// lower bits of source operand \a __y. The upper 64 bits of the return -/// value are undefined. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_insert_si64(__m128i __x, __m128i __y) -{ - return (__m128i)__builtin_ia32_insertq((__v2di)__x, (__v2di)__y); -} - -/// Stores a 64-bit double-precision value in a 64-bit memory location. -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVNTSD instruction. -/// -/// \param __p -/// The 64-bit memory location used to store the register value. -/// \param __a -/// The 64-bit double-precision floating-point register value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_sd(double *__p, __m128d __a) -{ - __builtin_ia32_movntsd(__p, (__v2df)__a); -} - -/// Stores a 32-bit single-precision floating-point value in a 32-bit -/// memory location. To minimize caching, the data is flagged as -/// non-temporal (unlikely to be used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVNTSS instruction. -/// -/// \param __p -/// The 32-bit memory location used to store the register value. -/// \param __a -/// The 32-bit single-precision floating-point register value to be stored. 
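For illustration, a minimal sketch of the non-temporal scalar store just described; the helper name is an example, and a store fence is issued so the streaming store becomes globally visible before the data is relied upon:

#include <ammintrin.h>   /* SSE4a; compile with e.g. -msse4a (AMD hardware only) */

static void example_stream_store(float *dst, __m128 v)
{
    _mm_stream_ss(dst, v);   /* non-temporal store of the low float of v */
    _mm_sfence();            /* order the streaming store with later accesses */
}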
-static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_ss(float *__p, __m128 __a) -{ - __builtin_ia32_movntss(__p, (__v4sf)__a); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __AMMINTRIN_H */ diff --git a/include/amxintrin.h b/include/amxintrin.h deleted file mode 100644 index 4940666..0000000 --- a/include/amxintrin.h +++ /dev/null @@ -1,494 +0,0 @@ -/*===--------------- amxintrin.h - AMX intrinsics -*- C/C++ -*---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===------------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif /* __IMMINTRIN_H */ - -#ifndef __AMXINTRIN_H -#define __AMXINTRIN_H -#ifdef __x86_64__ - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS_TILE \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-tile"))) -#define __DEFAULT_FN_ATTRS_INT8 \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-int8"))) -#define __DEFAULT_FN_ATTRS_BF16 \ - __attribute__((__always_inline__, __nodebug__, __target__("amx-bf16"))) - -/// Load tile configuration from a 64-byte memory location specified by -/// "mem_addr". The tile configuration includes the tile type palette, the -/// number of bytes per row, and the number of rows. If the specified -/// palette_id is zero, that signifies the init state for both the tile -/// config and the tile data, and the tiles are zeroed. Any invalid -/// configurations will result in #GP fault. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LDTILECFG instruction. -/// -/// \param __config -/// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS_TILE -_tile_loadconfig(const void *__config) { - __builtin_ia32_tile_loadconfig(__config); -} - -/// Stores the current tile configuration to a 64-byte memory location -/// specified by "mem_addr". The tile configuration includes the tile type -/// palette, the number of bytes per row, and the number of rows. If tiles -/// are not configured, all zeroes will be stored to memory. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the STTILECFG instruction. -/// -/// \param __config -/// A pointer to 512-bits configuration -static __inline__ void __DEFAULT_FN_ATTRS_TILE -_tile_storeconfig(void *__config) { - __builtin_ia32_tile_storeconfig(__config); -} - -/// Release the tile configuration to return to the init state, which -/// releases all storage it currently holds. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILERELEASE instruction. -static __inline__ void __DEFAULT_FN_ATTRS_TILE _tile_release(void) { - __builtin_ia32_tilerelease(); -} - -/// Load tile rows from memory specifieid by "base" address and "stride" into -/// destination tile "dst" using the tile configuration previously configured -/// via "_tile_loadconfig". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILELOADD instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. 
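For illustration, a minimal end-to-end sketch using the tile macros documented in this header; the names, the 16x64 shapes, and the omission of error handling and OS permission setup (on Linux a process typically has to request AMX state via arch_prctl first) are assumptions of the example, not part of the header:

#include <immintrin.h>   /* compile with e.g. -mamx-tile -mamx-int8 */

static void example_amx_dpbssd(const char *A, const char *B, int *C)
{
    /* 64-byte LDTILECFG blob: palette id at byte 0, per-tile bytes-per-row
     * (16-bit) at offset 16, per-tile row counts (8-bit) at offset 48. */
    char cfg[64] __attribute__((aligned(64))) = {0};
    unsigned short *colsb = (unsigned short *)(cfg + 16);
    unsigned char  *rows  = (unsigned char  *)(cfg + 48);
    cfg[0] = 1;                      /* palette 1 */
    rows[0] = 16; colsb[0] = 64;     /* tmm0: accumulator, 16 rows x 16 i32 */
    rows[1] = 16; colsb[1] = 64;     /* tmm1: 16 rows x 64 signed bytes */
    rows[2] = 16; colsb[2] = 64;     /* tmm2: 16 rows x 64 signed bytes */
    _tile_loadconfig(cfg);

    _tile_zero(0);
    _tile_loadd(1, A, 64);           /* stride: 64 bytes per row */
    _tile_loadd(2, B, 64);
    _tile_dpbssd(0, 1, 2);           /* tmm0 += dot-products of groups of 4 signed bytes */
    _tile_stored(0, C, 64);
    _tile_release();
}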
-#define _tile_loadd(dst, base, stride) \ - __builtin_ia32_tileloadd64((dst), ((const void *)(base)), \ - (__SIZE_TYPE__)(stride)) - -/// Load tile rows from memory specifieid by "base" address and "stride" into -/// destination tile "dst" using the tile configuration previously configured -/// via "_tile_loadconfig". This intrinsic provides a hint to the implementation -/// that the data will likely not be reused in the near future and the data -/// caching can be optimized accordingly. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILELOADDT1 instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -#define _tile_stream_loadd(dst, base, stride) \ - __builtin_ia32_tileloaddt164((dst), ((const void *)(base)), \ - (__SIZE_TYPE__)(stride)) - -/// Store the tile specified by "src" to memory specifieid by "base" address and -/// "stride" using the tile configuration previously configured via -/// "_tile_loadconfig". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILESTORED instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be stored in memory. -#define _tile_stored(dst, base, stride) \ - __builtin_ia32_tilestored64((dst), ((void *)(base)), (__SIZE_TYPE__)(stride)) - -/// Zero the tile specified by "tdest". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILEZERO instruction. -/// -/// \param tile -/// The destination tile to be zero. Max size is 1024 Bytes. -#define _tile_zero(tile) __builtin_ia32_tilezero((tile)) - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with -/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit -/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", -/// and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBSSD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbssd(dst, src0, src1) \ - __builtin_ia32_tdpbssd((dst), (src0), (src1)) - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with -/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate -/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in "dst", and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBSUD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbsud(dst, src0, src1) \ - __builtin_ia32_tdpbsud((dst), (src0), (src1)) - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. 
-/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with -/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit -/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", -/// and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBUSD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbusd(dst, src0, src1) \ - __builtin_ia32_tdpbusd((dst), (src0), (src1)) - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with -/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate -/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in -/// "dst", and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBUUD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbuud(dst, src0, src1) \ - __builtin_ia32_tdpbuud((dst), (src0), (src1)) - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and -/// src1, accumulating the intermediate single-precision (32-bit) floating-point -/// elements with elements in "dst", and store the 32-bit result back to tile -/// "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -#define _tile_dpbf16ps(dst, src0, src1) \ - __builtin_ia32_tdpbf16ps((dst), (src0), (src1)) - -/// AMX tile register size can be configured, the maximum size is 16x64=1024 -/// bytes. Since there is no 2D type in llvm IR, we use vector type to -/// represent 2D tile and the fixed size is maximum amx tile register size. -typedef int _tile1024i __attribute__((__vector_size__(1024), __aligned__(64))); - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_loadd_internal(unsigned short m, unsigned short n, const void *base, - __SIZE_TYPE__ stride) { - return __builtin_ia32_tileloadd64_internal(m, n, base, - (__SIZE_TYPE__)(stride)); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_loaddt1_internal(unsigned short m, unsigned short n, const void *base, - __SIZE_TYPE__ stride) { - return __builtin_ia32_tileloaddt164_internal(m, n, base, - (__SIZE_TYPE__)(stride)); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_dpbssd_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpbssd_internal(m, n, k, dst, src1, src2); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. 
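/* Editor's note, illustrative only: a scalar reference model of the dot-product
 * step the TDPBSSD documentation above describes, for a single 32-bit
 * accumulator element. The real instruction applies this to every element of a
 * configured tile pair; the 4-element grouping here is taken directly from the
 * comment text, everything else about shapes is intentionally left out.
 */
static inline int dpbssd_scalar_reference(int acc, const signed char a4[4],
                                          const signed char b4[4]) {
  /* Multiply 4 adjacent pairs of signed 8-bit integers, then sum the four
   * 32-bit products into the 32-bit accumulator. */
  for (int i = 0; i < 4; ++i)
    acc += (int)a4[i] * (int)b4[i];
  return acc;
}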
-static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_dpbsud_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpbsud_internal(m, n, k, dst, src1, src2); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_dpbusd_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpbusd_internal(m, n, k, dst, src1, src2); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_INT8 -_tile_dpbuud_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpbuud_internal(m, n, k, dst, src1, src2); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ void __DEFAULT_FN_ATTRS_INT8 -_tile_stored_internal(unsigned short m, unsigned short n, void *base, - __SIZE_TYPE__ stride, _tile1024i tile) { - return __builtin_ia32_tilestored64_internal(m, n, base, - (__SIZE_TYPE__)(stride), tile); -} - -/// This is internal intrinsic. C/C++ user should avoid calling it directly. -static __inline__ _tile1024i __DEFAULT_FN_ATTRS_BF16 -_tile_dpbf16ps_internal(unsigned short m, unsigned short n, unsigned short k, - _tile1024i dst, _tile1024i src1, _tile1024i src2) { - return __builtin_ia32_tdpbf16ps_internal(m, n, k, dst, src1, src2); -} - -/// This struct pack the shape and tile data together for user. We suggest -/// initializing the struct as early as possible, because compiler depends -/// on the shape information to do configure. The constant value is preferred -/// for optimization by compiler. -typedef struct __tile1024i_str { - const unsigned short row; - const unsigned short col; - _tile1024i tile; -} __tile1024i; - -/// Load tile rows from memory specifieid by "base" address and "stride" into -/// destination tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILELOADD instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TILE -static __inline__ void __tile_loadd(__tile1024i *dst, const void *base, - __SIZE_TYPE__ stride) { - dst->tile = _tile_loadd_internal(dst->row, dst->col, base, stride); -} - -/// Load tile rows from memory specifieid by "base" address and "stride" into -/// destination tile "dst". This intrinsic provides a hint to the implementation -/// that the data will likely not be reused in the near future and the data -/// caching can be optimized accordingly. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILELOADDT1 instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be loaded in memory. -__DEFAULT_FN_ATTRS_TILE -static __inline__ void __tile_stream_loadd(__tile1024i *dst, const void *base, - __SIZE_TYPE__ stride) { - dst->tile = _tile_loaddt1_internal(dst->row, dst->col, base, stride); -} - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. 
-/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with -/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit -/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", -/// and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBSSD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_INT8 -static __inline__ void __tile_dpbssd(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpbssd_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of signed 8-bit integers in src0 with -/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate -/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in "dst", and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBSUD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_INT8 -static __inline__ void __tile_dpbsud(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpbsud_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with -/// corresponding signed 8-bit integers in src1, producing 4 intermediate 32-bit -/// results. Sum these 4 results with the corresponding 32-bit integer in "dst", -/// and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBUSD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_INT8 -static __inline__ void __tile_dpbusd(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpbusd_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -/// Compute dot-product of bytes in tiles with a source/destination accumulator. -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in src0 with -/// corresponding unsigned 8-bit integers in src1, producing 4 intermediate -/// 32-bit results. Sum these 4 results with the corresponding 32-bit integer in -/// "dst", and store the 32-bit result back to tile "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBUUD instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. 
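/* Editor's note, illustrative only: a minimal sketch of the __tile1024i wrapper
 * API defined in this header. The 16-row / 64-bytes-per-row shapes and the
 * function name are assumptions made for the example, and tile configuration
 * via _tile_loadconfig is assumed to have happened already, as described above.
 */
static void tile1024i_usage_sketch(const void *a, const void *b, void *c,
                                   __SIZE_TYPE__ stride) {
  __tile1024i ta = {16, 64};   /* 16 rows x 64 bytes of int8 operand data   */
  __tile1024i tb = {16, 64};
  __tile1024i tc = {16, 64};   /* 16 rows x 64 bytes of int32 accumulators  */

  __tile_zero(&tc);                      /* clear the accumulator tile      */
  __tile_loadd(&ta, a, stride);          /* load the A operand              */
  __tile_loadd(&tb, b, stride);          /* load the B operand              */
  __tile_dpbssd(&tc, ta, tb);            /* signed int8 dot-product accumulate */
  __tile_stored(c, stride, tc);          /* store the int32 results         */
}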
-__DEFAULT_FN_ATTRS_INT8 -static __inline__ void __tile_dpbuud(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpbuud_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -/// Store the tile specified by "src" to memory specifieid by "base" address and -/// "stride". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILESTORED instruction. -/// -/// \param dst -/// A destination tile. Max size is 1024 Bytes. -/// \param base -/// A pointer to base address. -/// \param stride -/// The stride between the rows' data to be stored in memory. -__DEFAULT_FN_ATTRS_TILE -static __inline__ void __tile_stored(void *base, __SIZE_TYPE__ stride, - __tile1024i src) { - _tile_stored_internal(src.row, src.col, base, stride, src.tile); -} - -/// Zero the tile specified by "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TILEZERO instruction. -/// -/// \param dst -/// The destination tile to be zero. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_TILE -static __inline__ void __tile_zero(__tile1024i *dst) { - dst->tile = __builtin_ia32_tilezero_internal(dst->row, dst->col); -} - -/// Compute dot-product of BF16 (16-bit) floating-point pairs in tiles src0 and -/// src1, accumulating the intermediate single-precision (32-bit) floating-point -/// elements with elements in "dst", and store the 32-bit result back to tile -/// "dst". -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TDPBF16PS instruction. -/// -/// \param dst -/// The destination tile. Max size is 1024 Bytes. -/// \param src0 -/// The 1st source tile. Max size is 1024 Bytes. -/// \param src1 -/// The 2nd source tile. Max size is 1024 Bytes. -__DEFAULT_FN_ATTRS_BF16 -static __inline__ void __tile_dpbf16ps(__tile1024i *dst, __tile1024i src0, - __tile1024i src1) { - dst->tile = _tile_dpbf16ps_internal(src0.row, src1.col, src0.col, dst->tile, - src0.tile, src1.tile); -} - -#undef __DEFAULT_FN_ATTRS_TILE -#undef __DEFAULT_FN_ATTRS_INT8 -#undef __DEFAULT_FN_ATTRS_BF16 - -#endif /* __x86_64__ */ -#endif /* __AMXINTRIN_H */ diff --git a/include/avx2intrin.h b/include/avx2intrin.h deleted file mode 100644 index 38367a3..0000000 --- a/include/avx2intrin.h +++ /dev/null @@ -1,1240 +0,0 @@ -/*===---- avx2intrin.h - AVX2 intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX2INTRIN_H -#define __AVX2INTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx2"), __min_vector_width__(128))) - -/* SSE4 Multiple Packed Sums of Absolute Difference. 
*/ -#define _mm256_mpsadbw_epu8(X, Y, M) \ - ((__m256i)__builtin_ia32_mpsadbw256((__v32qi)(__m256i)(X), \ - (__v32qi)(__m256i)(Y), (int)(M))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi8(__m256i __a) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pabsb256((__v32qi)__a); -#else - return (__m256i)__builtin_elementwise_abs((__v32qs)__a); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi16(__m256i __a) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pabsw256((__v16hi)__a); -#else - return (__m256i)__builtin_elementwise_abs((__v16hi)__a); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi32(__m256i __a) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pabsd256((__v8si)__a); -#else - return (__m256i)__builtin_elementwise_abs((__v8si)__a); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_packsswb256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packs_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_packssdw256((__v8si)__a, (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_packuswb256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_packus_epi32(__m256i __V1, __m256i __V2) -{ - return (__m256i) __builtin_ia32_packusdw256((__v8si)__V1, (__v8si)__V2); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_add_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)((__v32qu)__a + (__v32qu)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_add_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)((__v16hu)__a + (__v16hu)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_add_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a + (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_add_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a + (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epi8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_add_sat((__v32qs)__a, (__v32qs)__b); -#else - return (__m256i)__builtin_ia32_paddsb256((__v32qi)__a, (__v32qi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epi16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_add_sat((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_ia32_paddsw256((__v16hi)__a, (__v16hi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epu8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_add_sat((__v32qu)__a, (__v32qu)__b); -#else - return (__m256i)__builtin_ia32_paddusb256((__v32qi)__a, (__v32qi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_adds_epu16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_add_sat((__v16hu)__a, (__v16hu)__b); -#else - return (__m256i)__builtin_ia32_paddusw256((__v16hi)__a, (__v16hi)__b); -#endif -} - -#define _mm256_alignr_epi8(a, b, n) \ - ((__m256i)__builtin_ia32_palignr256((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (n))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_and_si256(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a & (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_andnot_si256(__m256i __a, __m256i __b) -{ - return (__m256i)(~(__v4du)__a & (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_avg_epu8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pavgb256((__v32qi)__a, (__v32qi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_avg_epu16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pavgw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_blendv_epi8(__m256i __V1, __m256i __V2, __m256i __M) -{ - return (__m256i)__builtin_ia32_pblendvb256((__v32qi)__V1, (__v32qi)__V2, - (__v32qi)__M); -} - -#define _mm256_blend_epi16(V1, V2, M) \ - ((__m256i)__builtin_ia32_pblendw256((__v16hi)(__m256i)(V1), \ - (__v16hi)(__m256i)(V2), (int)(M))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpeq_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)((__v32qi)__a == (__v32qi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpeq_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)((__v16hi)__a == (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpeq_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8si)__a == (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpeq_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4di)__a == (__v4di)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpgt_epi8(__m256i __a, __m256i __b) -{ - /* This function always performs a signed comparison, but __v32qi is a char - which may be signed or unsigned, so use __v32qs. 
*/ - return (__m256i)((__v32qs)__a > (__v32qs)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpgt_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)((__v16hi)__a > (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpgt_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8si)__a > (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmpgt_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4di)__a > (__v4di)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadd_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddd256((__v8si)__a, (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hadds_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phaddsw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsub_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubd256((__v8si)__a, (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_hsubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_phsubsw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maddubs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmaddubsw256((__v32qi)__a, (__v32qi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_madd_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmaddwd256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epi8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxsb256((__v32qi)__a, (__v32qi)__b); -#else - return (__m256i)__builtin_elementwise_max((__v32qs)__a, (__v32qs)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epi16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxsw256((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_elementwise_max((__v16hi)__a, (__v16hi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epi32(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxsd256((__v8si)__a, (__v8si)__b); -#else - return (__m256i)__builtin_elementwise_max((__v8si)__a, (__v8si)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epu8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxub256((__v32qi)__a, (__v32qi)__b); -#else - return (__m256i)__builtin_elementwise_max((__v32qu)__a, (__v32qu)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epu16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxuw256((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_elementwise_max((__v16hu)__a, (__v16hu)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epu32(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxud256((__v8si)__a, (__v8si)__b); -#else - 
return (__m256i)__builtin_elementwise_max((__v8su)__a, (__v8su)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epi8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminsb256((__v32qi)__a, (__v32qi)__b); -#else - return (__m256i)__builtin_elementwise_min((__v32qs)__a, (__v32qs)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epi16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminsw256((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_elementwise_min((__v16hi)__a, (__v16hi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epi32(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminsd256((__v8si)__a, (__v8si)__b); -#else - return (__m256i)__builtin_elementwise_min((__v8si)__a, (__v8si)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epu8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminub256((__v32qi)__a, (__v32qi)__b); -#else - return (__m256i)__builtin_elementwise_min((__v32qu)__a, (__v32qu)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epu16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminuw256 ((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_elementwise_min((__v16hu)__a, (__v16hu)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epu32(__m256i __a, __m256i __b) -{ -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminud256((__v8si)__a, (__v8si)__b); -#else - return (__m256i)__builtin_elementwise_min((__v8su)__a, (__v8su)__b); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS256 -_mm256_movemask_epi8(__m256i __a) -{ - return __builtin_ia32_pmovmskb256((__v32qi)__a); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi16(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m256i)__builtin_convertvector((__v16qs)__V, __v16hi); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi32(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi8_epi64(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. 
*/ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_epi32(__m128i __V) -{ - return (__m256i)__builtin_convertvector((__v8hi)__V, __v8si); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_epi64(__m128i __V) -{ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi32_epi64(__m128i __V) -{ - return (__m256i)__builtin_convertvector((__v4si)__V, __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi16(__m128i __V) -{ - return (__m256i)__builtin_convertvector((__v16qu)__V, __v16hi); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi32(__m128i __V) -{ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8si); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu8_epi64(__m128i __V) -{ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu16_epi32(__m128i __V) -{ - return (__m256i)__builtin_convertvector((__v8hu)__V, __v8si); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu16_epi64(__m128i __V) -{ - return (__m256i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtepu32_epi64(__m128i __V) -{ - return (__m256i)__builtin_convertvector((__v4su)__V, __v4di); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmuldq256((__v8si)__a, (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mulhrs_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmulhrsw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mulhi_epu16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmulhuw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mulhi_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pmulhw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mullo_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)((__v16hu)__a * (__v16hu)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mullo_epi32 (__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a * (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mul_epu32(__m256i __a, __m256i __b) -{ - return __builtin_ia32_pmuludq256((__v8si)__a, (__v8si)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_or_si256(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a | (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sad_epu8(__m256i __a, __m256i __b) -{ - return __builtin_ia32_psadbw256((__v32qi)__a, (__v32qi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shuffle_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_pshufb256((__v32qi)__a, (__v32qi)__b); -} - -#define _mm256_shuffle_epi32(a, imm) \ - ((__m256i)__builtin_ia32_pshufd256((__v8si)(__m256i)(a), (int)(imm))) - -#define _mm256_shufflehi_epi16(a, 
imm) \ - ((__m256i)__builtin_ia32_pshufhw256((__v16hi)(__m256i)(a), (int)(imm))) - -#define _mm256_shufflelo_epi16(a, imm) \ - ((__m256i)__builtin_ia32_pshuflw256((__v16hi)(__m256i)(a), (int)(imm))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); -} - -#define _mm256_slli_si256(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) - -#define _mm256_bslli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_pslldqi256_byteshift((__v4di)(__m256i)(a), (int)(imm))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi16(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_psllwi256((__v16hi)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi16(__m256i __a, __m128i __count) -{ - return (__m256i)__builtin_ia32_psllw256((__v16hi)__a, (__v8hi)__count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi32(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_pslldi256((__v8si)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi32(__m256i __a, __m128i __count) -{ - return (__m256i)__builtin_ia32_pslld256((__v8si)__a, (__v4si)__count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_slli_epi64(__m256i __a, int __count) -{ - return __builtin_ia32_psllqi256((__v4di)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sll_epi64(__m256i __a, __m128i __count) -{ - return __builtin_ia32_psllq256((__v4di)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi16(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_psrawi256((__v16hi)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi16(__m256i __a, __m128i __count) -{ - return (__m256i)__builtin_ia32_psraw256((__v16hi)__a, (__v8hi)__count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi32(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_psradi256((__v8si)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi32(__m256i __a, __m128i __count) -{ - return (__m256i)__builtin_ia32_psrad256((__v8si)__a, (__v4si)__count); -} - -#define _mm256_srli_si256(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) - -#define _mm256_bsrli_epi128(a, imm) \ - ((__m256i)__builtin_ia32_psrldqi256_byteshift((__m256i)(a), (int)(imm))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi16(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_psrlwi256((__v16hi)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi16(__m256i __a, __m128i __count) -{ - return (__m256i)__builtin_ia32_psrlw256((__v16hi)__a, (__v8hi)__count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi32(__m256i __a, int __count) -{ - return (__m256i)__builtin_ia32_psrldi256((__v8si)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi32(__m256i __a, __m128i __count) -{ - return 
(__m256i)__builtin_ia32_psrld256((__v8si)__a, (__v4si)__count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srli_epi64(__m256i __a, int __count) -{ - return __builtin_ia32_psrlqi256((__v4di)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srl_epi64(__m256i __a, __m128i __count) -{ - return __builtin_ia32_psrlq256((__v4di)__a, __count); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sub_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)((__v32qu)__a - (__v32qu)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sub_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)((__v16hu)__a - (__v16hu)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sub_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a - (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sub_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a - (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epi8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_sub_sat((__v32qs)__a, (__v32qs)__b); -#else - return (__m256i)__builtin_ia32_psubsb256((__v32qi)__a, (__v32qi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epi16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_sub_sat((__v16hi)__a, (__v16hi)__b); -#else - return (__m256i)__builtin_ia32_psubsw256((__v16hi)__a, (__v16hi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epu8(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_sub_sat((__v32qu)__a, (__v32qu)__b); -#else - return (__m256i)__builtin_ia32_psubusb256((__v32qi)__a, (__v32qi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_subs_epu16(__m256i __a, __m256i __b) -{ -#if (__clang_major__ > 14) - return (__m256i)__builtin_elementwise_sub_sat((__v16hu)__a, (__v16hu)__b); -#else - return (__m256i)__builtin_ia32_psubusw256((__v16hi)__a, (__v16hi)__b); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 8, 32+8, 9, 32+9, 10, 32+10, 11, 32+11, 12, 32+12, 13, 32+13, 14, 32+14, 15, 32+15, 24, 32+24, 25, 32+25, 26, 32+26, 27, 32+27, 28, 32+28, 29, 32+29, 30, 32+30, 31, 32+31); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 2, 8+2, 3, 8+3, 6, 8+6, 7, 8+7); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpackhi_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 1, 4+1, 3, 4+3); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v32qi)__a, (__v32qi)__b, 0, 32+0, 1, 32+1, 2, 32+2, 3, 32+3, 4, 32+4, 5, 32+5, 6, 32+6, 7, 32+7, 16, 32+16, 17, 32+17, 18, 32+18, 19, 32+19, 20, 32+20, 21, 32+21, 22, 32+22, 23, 32+23); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_unpacklo_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v16hi)__a, (__v16hi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v8si)__a, (__v8si)__b, 0, 8+0, 1, 8+1, 4, 8+4, 5, 8+5); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_unpacklo_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_shufflevector((__v4di)__a, (__v4di)__b, 0, 4+0, 2, 4+2); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_xor_si256(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a ^ (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_stream_load_si256(__m256i const *__V) -{ - typedef __v4di __v4di_aligned __attribute__((aligned(32))); - return (__m256i)__builtin_nontemporal_load((const __v4di_aligned *)__V); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_broadcastss_ps(__m128 __X) -{ - return (__m128)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_broadcastsd_pd(__m128d __a) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcastss_ps(__m128 __X) -{ - return (__m256)__builtin_shufflevector((__v4sf)__X, (__v4sf)__X, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_broadcastsd_pd(__m128d __X) -{ - return (__m256d)__builtin_shufflevector((__v2df)__X, (__v2df)__X, 0, 0, 0, 0); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastsi128_si256(__m128i __X) -{ - return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 1, 0, 1); -} - -#define _mm_broadcastsi128_si256(X) _mm256_broadcastsi128_si256(X) - -#define _mm_blend_epi32(V1, V2, M) \ - ((__m128i)__builtin_ia32_pblendd128((__v4si)(__m128i)(V1), \ - (__v4si)(__m128i)(V2), (int)(M))) - -#define _mm256_blend_epi32(V1, V2, M) \ - ((__m256i)__builtin_ia32_pblendd256((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), (int)(M))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastb_epi8(__m128i __X) -{ - return (__m256i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastw_epi16(__m128i __X) -{ - return (__m256i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastd_epi32(__m128i __X) -{ - return (__m256i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastq_epi64(__m128i __X) -{ - return (__m256i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0, 0, 0); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastb_epi8(__m128i __X) -{ - return (__m128i)__builtin_shufflevector((__v16qi)__X, (__v16qi)__X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastw_epi16(__m128i __X) -{ - return (__m128i)__builtin_shufflevector((__v8hi)__X, (__v8hi)__X, 0, 0, 0, 0, 0, 0, 0, 0); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastd_epi32(__m128i __X) -{ - return 
(__m128i)__builtin_shufflevector((__v4si)__X, (__v4si)__X, 0, 0, 0, 0); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastq_epi64(__m128i __X) -{ - return (__m128i)__builtin_shufflevector((__v2di)__X, (__v2di)__X, 0, 0); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_permvarsi256((__v8si)__a, (__v8si)__b); -} - -#define _mm256_permute4x64_pd(V, M) \ - ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(V), (int)(M))) - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_permutevar8x32_ps(__m256 __a, __m256i __b) -{ - return (__m256)__builtin_ia32_permvarsf256((__v8sf)__a, (__v8si)__b); -} - -#define _mm256_permute4x64_epi64(V, M) \ - ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(V), (int)(M))) - -#define _mm256_permute2x128_si256(V1, V2, M) \ - ((__m256i)__builtin_ia32_permti256((__m256i)(V1), (__m256i)(V2), (int)(M))) - -#define _mm256_extracti128_si256(V, M) \ - ((__m128i)__builtin_ia32_extract128i256((__v4di)(__m256i)(V), (int)(M))) - -#define _mm256_inserti128_si256(V1, V2, M) \ - ((__m256i)__builtin_ia32_insert128i256((__v4di)(__m256i)(V1), \ - (__v2di)(__m128i)(V2), (int)(M))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskload_epi32(int const *__X, __m256i __M) -{ - return (__m256i)__builtin_ia32_maskloadd256((const __v8si *)__X, (__v8si)__M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskload_epi64(long long const *__X, __m256i __M) -{ - return (__m256i)__builtin_ia32_maskloadq256((const __v4di *)__X, (__v4di)__M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskload_epi32(int const *__X, __m128i __M) -{ - return (__m128i)__builtin_ia32_maskloadd((const __v4si *)__X, (__v4si)__M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskload_epi64(long long const *__X, __m128i __M) -{ - return (__m128i)__builtin_ia32_maskloadq((const __v2di *)__X, (__v2di)__M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_maskstore_epi32(int *__X, __m256i __M, __m256i __Y) -{ - __builtin_ia32_maskstored256((__v8si *)__X, (__v8si)__M, (__v8si)__Y); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_maskstore_epi64(long long *__X, __m256i __M, __m256i __Y) -{ - __builtin_ia32_maskstoreq256((__v4di *)__X, (__v4di)__M, (__v4di)__Y); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_maskstore_epi32(int *__X, __m128i __M, __m128i __Y) -{ - __builtin_ia32_maskstored((__v4si *)__X, (__v4si)__M, (__v4si)__Y); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_maskstore_epi64(long long *__X, __m128i __M, __m128i __Y) -{ - __builtin_ia32_maskstoreq(( __v2di *)__X, (__v2di)__M, (__v2di)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sllv_epi32(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_psllv8si((__v8si)__X, (__v8si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sllv_epi32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psllv4si((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sllv_epi64(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_psllv4di((__v4di)__X, (__v4di)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sllv_epi64(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psllv2di((__v2di)__X, (__v2di)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srav_epi32(__m256i __X, __m256i __Y) -{ - return 
(__m256i)__builtin_ia32_psrav8si((__v8si)__X, (__v8si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srav_epi32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psrav4si((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srlv_epi32(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_psrlv8si((__v8si)__X, (__v8si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srlv_epi32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psrlv4si((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srlv_epi64(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_psrlv4di((__v4di)__X, (__v4di)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srlv_epi64(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psrlv2di((__v2di)__X, (__v2di)__Y); -} - -#define _mm_mask_i32gather_pd(a, m, i, mask, s) \ - ((__m128d)__builtin_ia32_gatherd_pd((__v2df)(__m128i)(a), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s))) - -#define _mm256_mask_i32gather_pd(a, m, i, mask, s) \ - ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)(__m256d)(a), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4df)(__m256d)(mask), (s))) - -#define _mm_mask_i64gather_pd(a, m, i, mask, s) \ - ((__m128d)__builtin_ia32_gatherq_pd((__v2df)(__m128d)(a), \ - (double const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2df)(__m128d)(mask), (s))) - -#define _mm256_mask_i64gather_pd(a, m, i, mask, s) \ - ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)(__m256d)(a), \ - (double const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4df)(__m256d)(mask), (s))) - -#define _mm_mask_i32gather_ps(a, m, i, mask, s) \ - ((__m128)__builtin_ia32_gatherd_ps((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s))) - -#define _mm256_mask_i32gather_ps(a, m, i, mask, s) \ - ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)(__m256)(a), \ - (float const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8sf)(__m256)(mask), (s))) - -#define _mm_mask_i64gather_ps(a, m, i, mask, s) \ - ((__m128)__builtin_ia32_gatherq_ps((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4sf)(__m128)(mask), (s))) - -#define _mm256_mask_i64gather_ps(a, m, i, mask, s) \ - ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)(__m128)(a), \ - (float const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4sf)(__m128)(mask), (s))) - -#define _mm_mask_i32gather_epi32(a, m, i, mask, s) \ - ((__m128i)__builtin_ia32_gatherd_d((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s))) - -#define _mm256_mask_i32gather_epi32(a, m, i, mask, s) \ - ((__m256i)__builtin_ia32_gatherd_d256((__v8si)(__m256i)(a), \ - (int const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8si)(__m256i)(mask), (s))) - -#define _mm_mask_i64gather_epi32(a, m, i, mask, s) \ - ((__m128i)__builtin_ia32_gatherq_d((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4si)(__m128i)(mask), (s))) - -#define _mm256_mask_i64gather_epi32(a, m, i, mask, s) \ - ((__m128i)__builtin_ia32_gatherq_d256((__v4si)(__m128i)(a), \ - (int const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4si)(__m128i)(mask), (s))) - -#define _mm_mask_i32gather_epi64(a, m, i, mask, s) \ - ((__m128i)__builtin_ia32_gatherd_q((__v2di)(__m128i)(a), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s))) - -#define 
_mm256_mask_i32gather_epi64(a, m, i, mask, s) \ - ((__m256i)__builtin_ia32_gatherd_q256((__v4di)(__m256i)(a), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4di)(__m256i)(mask), (s))) - -#define _mm_mask_i64gather_epi64(a, m, i, mask, s) \ - ((__m128i)__builtin_ia32_gatherq_q((__v2di)(__m128i)(a), \ - (long long const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2di)(__m128i)(mask), (s))) - -#define _mm256_mask_i64gather_epi64(a, m, i, mask, s) \ - ((__m256i)__builtin_ia32_gatherq_q256((__v4di)(__m256i)(a), \ - (long long const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4di)(__m256i)(mask), (s))) - -#define _mm_i32gather_pd(m, i, s) \ - ((__m128d)__builtin_ia32_gatherd_pd((__v2df)_mm_undefined_pd(), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ - _mm_setzero_pd()), \ - (s))) - -#define _mm256_i32gather_pd(m, i, s) \ - ((__m256d)__builtin_ia32_gatherd_pd256((__v4df)_mm256_undefined_pd(), \ - (double const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ - _mm256_setzero_pd(), \ - _CMP_EQ_OQ), \ - (s))) - -#define _mm_i64gather_pd(m, i, s) \ - ((__m128d)__builtin_ia32_gatherq_pd((__v2df)_mm_undefined_pd(), \ - (double const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2df)_mm_cmpeq_pd(_mm_setzero_pd(), \ - _mm_setzero_pd()), \ - (s))) - -#define _mm256_i64gather_pd(m, i, s) \ - ((__m256d)__builtin_ia32_gatherq_pd256((__v4df)_mm256_undefined_pd(), \ - (double const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4df)_mm256_cmp_pd(_mm256_setzero_pd(), \ - _mm256_setzero_pd(), \ - _CMP_EQ_OQ), \ - (s))) - -#define _mm_i32gather_ps(m, i, s) \ - ((__m128)__builtin_ia32_gatherd_ps((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s))) - -#define _mm256_i32gather_ps(m, i, s) \ - ((__m256)__builtin_ia32_gatherd_ps256((__v8sf)_mm256_undefined_ps(), \ - (float const *)(m), \ - (__v8si)(__m256i)(i), \ - (__v8sf)_mm256_cmp_ps(_mm256_setzero_ps(), \ - _mm256_setzero_ps(), \ - _CMP_EQ_OQ), \ - (s))) - -#define _mm_i64gather_ps(m, i, s) \ - ((__m128)__builtin_ia32_gatherq_ps((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s))) - -#define _mm256_i64gather_ps(m, i, s) \ - ((__m128)__builtin_ia32_gatherq_ps256((__v4sf)_mm_undefined_ps(), \ - (float const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4sf)_mm_cmpeq_ps(_mm_setzero_ps(), \ - _mm_setzero_ps()), \ - (s))) - -#define _mm_i32gather_epi32(m, i, s) \ - ((__m128i)__builtin_ia32_gatherd_d((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v4si)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s))) - -#define _mm256_i32gather_epi32(m, i, s) \ - ((__m256i)__builtin_ia32_gatherd_d256((__v8si)_mm256_undefined_si256(), \ - (int const *)(m), (__v8si)(__m256i)(i), \ - (__v8si)_mm256_set1_epi32(-1), (s))) - -#define _mm_i64gather_epi32(m, i, s) \ - ((__m128i)__builtin_ia32_gatherq_d((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v2di)(__m128i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s))) - -#define _mm256_i64gather_epi32(m, i, s) \ - ((__m128i)__builtin_ia32_gatherq_d256((__v4si)_mm_undefined_si128(), \ - (int const *)(m), (__v4di)(__m256i)(i), \ - (__v4si)_mm_set1_epi32(-1), (s))) - -#define _mm_i32gather_epi64(m, i, s) \ - ((__m128i)__builtin_ia32_gatherd_q((__v2di)_mm_undefined_si128(), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s))) - 
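/* Editor's note, illustrative only: a small usage sketch for the AVX2 gather
 * macros above. The table contents and index pattern are made up for the
 * example; the scale argument is the element stride in bytes and must be a
 * compile-time constant of 1, 2, 4 or 8. Assumes a translation unit built with
 * clang and -mavx2.
 */
#include <immintrin.h>

static __m256i gather_eight_ints_sketch(const int *table) {
  /* Gathers table[0], table[2], table[4], table[6], table[1], table[3],
   * table[5], table[7] into one 256-bit vector using a 4-byte scale. */
  __m256i idx = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7);
  return _mm256_i32gather_epi32(table, idx, 4);
}

static __m256i gather_masked_sketch(const int *table, __m256i idx,
                                    __m256i mask, __m256i fallback) {
  /* Elements whose mask sign bit is set are gathered from table[idx];
   * the remaining elements keep their value from 'fallback'. */
  return _mm256_mask_i32gather_epi32(fallback, table, idx, mask, 4);
}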
-#define _mm256_i32gather_epi64(m, i, s) \ - ((__m256i)__builtin_ia32_gatherd_q256((__v4di)_mm256_undefined_si256(), \ - (long long const *)(m), \ - (__v4si)(__m128i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s))) - -#define _mm_i64gather_epi64(m, i, s) \ - ((__m128i)__builtin_ia32_gatherq_q((__v2di)_mm_undefined_si128(), \ - (long long const *)(m), \ - (__v2di)(__m128i)(i), \ - (__v2di)_mm_set1_epi64x(-1), (s))) - -#define _mm256_i64gather_epi64(m, i, s) \ - ((__m256i)__builtin_ia32_gatherq_q256((__v4di)_mm256_undefined_si256(), \ - (long long const *)(m), \ - (__v4di)(__m256i)(i), \ - (__v4di)_mm256_set1_epi64x(-1), (s))) - -#undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS128 - -#endif /* __AVX2INTRIN_H */ diff --git a/include/avx512bf16intrin.h b/include/avx512bf16intrin.h deleted file mode 100644 index eef0fc3..0000000 --- a/include/avx512bf16intrin.h +++ /dev/null @@ -1,285 +0,0 @@ -/*===------------ avx512bf16intrin.h - AVX512_BF16 intrinsics --------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512BF16INTRIN_H -#define __AVX512BF16INTRIN_H - -#if (__clang_major__ > 15) -typedef __bf16 __v32bf __attribute__((__vector_size__(64), __aligned__(64))); -typedef __bf16 __m512bh __attribute__((__vector_size__(64), __aligned__(64))); -typedef __bf16 __bfloat16; -#else -typedef short __m512bh __attribute__((__vector_size__(64), __aligned__(64))); -typedef short __m256bh __attribute__((__vector_size__(32), __aligned__(32))); -typedef unsigned short __bfloat16; -#endif - -#define __DEFAULT_FN_ATTRS512 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"), \ - __min_vector_width__(512))) -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512bf16"))) - -/// Convert One BF16 Data to One Single Float Data. -/// -/// \headerfile -/// -/// This intrinsic does not correspond to a specific instruction. -/// -/// \param __A -/// A bfloat data. -/// \returns A float data whose sign field and exponent field keep unchanged, -/// and fraction field is extended to 23 bits. -static __inline__ float __DEFAULT_FN_ATTRS _mm_cvtsbh_ss(__bfloat16 __A) { - return __builtin_ia32_cvtsbf162ss_32(__A); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \param __B -/// A 512-bit vector of [16 x float]. -/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from -/// conversion of __B, and higher 256 bits come from conversion of __A. -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 -_mm512_cvtne2ps_pbh(__m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_cvtne2ps2bf16_512((__v16sf) __A, - (__v16sf) __B); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \param __B -/// A 512-bit vector of [16 x float]. -/// \param __W -/// A 512-bit vector of [32 x bfloat]. 
-/// \param __U -/// A 32-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element from __W. -/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from -/// conversion of __B, and higher 256 bits come from conversion of __A. -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtne2ps_pbh(__m512bh __W, __mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), - (__v32hi)__W); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \param __B -/// A 512-bit vector of [16 x float]. -/// \param __U -/// A 32-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element is zero. -/// \returns A 512-bit vector of [32 x bfloat] whose lower 256 bits come from -/// conversion of __B, and higher 256 bits come from conversion of __A. -static __inline__ __m512bh __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtne2ps_pbh(__mmask32 __U, __m512 __A, __m512 __B) { - return (__m512bh)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtne2ps_pbh(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS512 -_mm512_cvtneps_pbh(__m512 __A) { - return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)_mm256_undefined_si256(), - (__mmask16)-1); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \param __W -/// A 256-bit vector of [16 x bfloat]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element from __W. -/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtneps_pbh(__m256bh __W, __mmask16 __U, __m512 __A) { - return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)__W, - (__mmask16)__U); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 512-bit vector of [16 x float]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element is zero. -/// \returns A 256-bit vector of [16 x bfloat] come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtneps_pbh(__mmask16 __U, __m512 __A) { - return (__m256bh)__builtin_ia32_cvtneps2bf16_512_mask((__v16sf)__A, - (__v16hi)_mm256_setzero_si256(), - (__mmask16)__U); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 512-bit vector of [32 x bfloat]. 
-/// \param __B -/// A 512-bit vector of [32 x bfloat]. -/// \param __D -/// A 512-bit vector of [16 x float]. -/// \returns A 512-bit vector of [16 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_dpbf16_ps(__m512 __D, __m512bh __A, __m512bh __B) { - return (__m512)__builtin_ia32_dpbf16ps_512((__v16sf) __D, - (__v16si) __A, - (__v16si) __B); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 512-bit vector of [32 x bfloat]. -/// \param __B -/// A 512-bit vector of [32 x bfloat]. -/// \param __D -/// A 512-bit vector of [16 x float]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. -/// \returns A 512-bit vector of [16 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_dpbf16_ps(__m512 __D, __mmask16 __U, __m512bh __A, __m512bh __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), - (__v16sf)__D); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 512-bit vector of [32 x bfloat]. -/// \param __B -/// A 512-bit vector of [32 x bfloat]. -/// \param __D -/// A 512-bit vector of [16 x float]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. -/// \returns A 512-bit vector of [16 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_dpbf16_ps(__mmask16 __U, __m512 __D, __m512bh __A, __m512bh __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_dpbf16_ps(__D, __A, __B), - (__v16sf)_mm512_setzero_si512()); -} - -/// Convert Packed BF16 Data to Packed float Data. -/// -/// \headerfile -/// -/// \param __A -/// A 256-bit vector of [16 x bfloat]. -/// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtpbh_ps(__m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using zeroing mask. -/// -/// \headerfile -/// -/// \param __U -/// A 16-bit mask. Elements are zeroed out when the corresponding mask -/// bit is not set. -/// \param __A -/// A 256-bit vector of [16 x bfloat]. -/// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpbh_ps(__mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_slli_epi32( - (__m512i)_mm512_maskz_cvtepi16_epi32((__mmask16)__U, (__m256i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using merging mask. -/// -/// \headerfile -/// -/// \param __S -/// A 512-bit vector of [16 x float]. Elements are copied from __S when -/// the corresponding mask bit is not set. -/// \param __U -/// A 16-bit mask. -/// \param __A -/// A 256-bit vector of [16 x bfloat]. 
-/// \returns A 512-bit vector of [16 x float] come from conversion of __A -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpbh_ps(__m512 __S, __mmask16 __U, __m256bh __A) { - return _mm512_castsi512_ps((__m512i)_mm512_mask_slli_epi32( - (__m512i)__S, (__mmask16)__U, - (__m512i)_mm512_cvtepi16_epi32((__m256i)__A), 16)); -} - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS512 - -#endif diff --git a/include/avx512bitalgintrin.h b/include/avx512bitalgintrin.h deleted file mode 100644 index d4411d1..0000000 --- a/include/avx512bitalgintrin.h +++ /dev/null @@ -1,83 +0,0 @@ -/*===------------- avx512bitalgintrin.h - BITALG intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512BITALGINTRIN_H -#define __AVX512BITALGINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bitalg"), __min_vector_width__(512))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_popcnt_epi16(__m512i __A) -{ - return (__m512i) __builtin_ia32_vpopcntw_512((__v32hi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi16(__m512i __A, __mmask32 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi) _mm512_popcnt_epi16(__B), - (__v32hi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi16(__mmask32 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi16((__m512i) _mm512_setzero_si512(), - __U, - __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_popcnt_epi8(__m512i __A) -{ - return (__m512i) __builtin_ia32_vpopcntb_512((__v64qi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi8(__m512i __A, __mmask64 __U, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectb_512((__mmask64) __U, - (__v64qi) _mm512_popcnt_epi8(__B), - (__v64qi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi8(__mmask64 __U, __m512i __B) -{ - return _mm512_mask_popcnt_epi8((__m512i) _mm512_setzero_si512(), - __U, - __B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_mask_bitshuffle_epi64_mask(__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__mmask64) __builtin_ia32_vpshufbitqmb512_mask((__v64qi) __A, - (__v64qi) __B, - __U); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_bitshuffle_epi64_mask(__m512i __A, __m512i __B) -{ - return _mm512_mask_bitshuffle_epi64_mask((__mmask64) -1, - __A, - __B); -} - - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512bwintrin.h b/include/avx512bwintrin.h deleted file mode 100644 index 717b92b..0000000 --- a/include/avx512bwintrin.h +++ /dev/null @@ -1,2104 +0,0 @@ -/*===------------- avx512bwintrin.h - AVX512BW intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512BWINTRIN_H -#define __AVX512BWINTRIN_H - -typedef unsigned int __mmask32; -typedef unsigned long long __mmask64; - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"), __min_vector_width__(512))) -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512bw"))) - -static __inline __mmask32 __DEFAULT_FN_ATTRS -_knot_mask32(__mmask32 __M) -{ - return __builtin_ia32_knotsi(__M); -} - -static __inline __mmask64 __DEFAULT_FN_ATTRS -_knot_mask64(__mmask64 __M) -{ - return __builtin_ia32_knotdi(__M); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kand_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_kandsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kand_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kanddi((__mmask64)__A, (__mmask64)__B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kandn_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_kandnsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kandn_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kandndi((__mmask64)__A, (__mmask64)__B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kor_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_korsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kor_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kordi((__mmask64)__A, (__mmask64)__B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kxnor_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_kxnorsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kxnor_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kxnordi((__mmask64)__A, (__mmask64)__B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kxor_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_kxorsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kxor_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kxordi((__mmask64)__A, (__mmask64)__B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask32_u8(__mmask32 __A, __mmask32 __B) -{ - return (unsigned char)__builtin_ia32_kortestcsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask32_u8(__mmask32 __A, __mmask32 __B) -{ - return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_kortestcsi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask64_u8(__mmask64 __A, __mmask64 __B) -{ - return (unsigned char)__builtin_ia32_kortestcdi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask64_u8(__mmask64 __A, __mmask64 __B) -{ - return (unsigned 
char)__builtin_ia32_kortestzdi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_kortestcdi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzdi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask32_u8(__mmask32 __A, __mmask32 __B) -{ - return (unsigned char)__builtin_ia32_ktestcsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask32_u8(__mmask32 __A, __mmask32 __B) -{ - return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktest_mask32_u8(__mmask32 __A, __mmask32 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_ktestcsi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzsi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask64_u8(__mmask64 __A, __mmask64 __B) -{ - return (unsigned char)__builtin_ia32_ktestcdi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask64_u8(__mmask64 __A, __mmask64 __B) -{ - return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktest_mask64_u8(__mmask64 __A, __mmask64 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_ktestcdi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzdi(__A, __B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_kadd_mask32(__mmask32 __A, __mmask32 __B) -{ - return (__mmask32)__builtin_ia32_kaddsi((__mmask32)__A, (__mmask32)__B); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_kadd_mask64(__mmask64 __A, __mmask64 __B) -{ - return (__mmask64)__builtin_ia32_kadddi((__mmask64)__A, (__mmask64)__B); -} - -#define _kshiftli_mask32(A, I) \ - ((__mmask32)__builtin_ia32_kshiftlisi((__mmask32)(A), (unsigned int)(I))) - -#define _kshiftri_mask32(A, I) \ - ((__mmask32)__builtin_ia32_kshiftrisi((__mmask32)(A), (unsigned int)(I))) - -#define _kshiftli_mask64(A, I) \ - ((__mmask64)__builtin_ia32_kshiftlidi((__mmask64)(A), (unsigned int)(I))) - -#define _kshiftri_mask64(A, I) \ - ((__mmask64)__builtin_ia32_kshiftridi((__mmask64)(A), (unsigned int)(I))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_cvtmask32_u32(__mmask32 __A) { - return (unsigned int)__builtin_ia32_kmovd((__mmask32)__A); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_cvtmask64_u64(__mmask64 __A) { - return (unsigned long long)__builtin_ia32_kmovq((__mmask64)__A); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_cvtu32_mask32(unsigned int __A) { - return (__mmask32)__builtin_ia32_kmovd((__mmask32)__A); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_cvtu64_mask64(unsigned long long __A) { - return (__mmask64)__builtin_ia32_kmovq((__mmask64)__A); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_load_mask32(__mmask32 *__A) { - return (__mmask32)__builtin_ia32_kmovd(*(__mmask32 *)__A); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_load_mask64(__mmask64 *__A) { - return (__mmask64)__builtin_ia32_kmovq(*(__mmask64 *)__A); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_store_mask32(__mmask32 *__A, __mmask32 __B) { - *(__mmask32 *)__A = __builtin_ia32_kmovd((__mmask32)__B); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_store_mask64(__mmask64 *__A, __mmask64 __B) { - *(__mmask64 *)__A = __builtin_ia32_kmovq((__mmask64)__B); -} - -/* Integer compare */ - -#define _mm512_cmp_epi8_mask(a, b, p) \ - 
((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1)) - -#define _mm512_mask_cmp_epi8_mask(m, a, b, p) \ - ((__mmask64)__builtin_ia32_cmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m))) - -#define _mm512_cmp_epu8_mask(a, b, p) \ - ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)-1)) - -#define _mm512_mask_cmp_epu8_mask(m, a, b, p) \ - ((__mmask64)__builtin_ia32_ucmpb512_mask((__v64qi)(__m512i)(a), \ - (__v64qi)(__m512i)(b), (int)(p), \ - (__mmask64)(m))) - -#define _mm512_cmp_epi16_mask(a, b, p) \ - ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1)) - -#define _mm512_mask_cmp_epi16_mask(m, a, b, p) \ - ((__mmask32)__builtin_ia32_cmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m))) - -#define _mm512_cmp_epu16_mask(a, b, p) \ - ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)-1)) - -#define _mm512_mask_cmp_epu16_mask(m, a, b, p) \ - ((__mmask32)__builtin_ia32_ucmpw512_mask((__v32hi)(__m512i)(a), \ - (__v32hi)(__m512i)(b), (int)(p), \ - (__mmask32)(m))) - -#define _mm512_cmpeq_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm512_mask_cmpeq_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm512_cmpge_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) -#define _mm512_mask_cmpge_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm512_cmpgt_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) -#define _mm512_mask_cmpgt_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm512_cmple_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) -#define _mm512_mask_cmple_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm512_cmplt_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) -#define _mm512_mask_cmplt_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm512_cmpneq_epi8_mask(A, B) \ - _mm512_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) -#define _mm512_mask_cmpneq_epi8_mask(k, A, B) \ - _mm512_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm512_cmpeq_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm512_mask_cmpeq_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm512_cmpge_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) -#define _mm512_mask_cmpge_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm512_cmpgt_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) -#define _mm512_mask_cmpgt_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm512_cmple_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) -#define _mm512_mask_cmple_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm512_cmplt_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) -#define _mm512_mask_cmplt_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm512_cmpneq_epu8_mask(A, B) \ - _mm512_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) 
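A short sketch of how the byte-compare macros above are typically consumed (hypothetical helper; assumes AVX512BW support):

#include <immintrin.h>

/* Count how many of the 64 bytes in `v` equal `byte`: the compare macro yields
 * one mask bit per byte, and popcounting the mask gives the tally. */
static int count_matching_bytes(__m512i v, char byte)
{
    __mmask64 eq = _mm512_cmpeq_epi8_mask(v, _mm512_set1_epi8(byte));
    return __builtin_popcountll(_cvtmask64_u64(eq));
}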
-#define _mm512_mask_cmpneq_epu8_mask(k, A, B) \ - _mm512_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm512_cmpeq_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) -#define _mm512_mask_cmpeq_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm512_cmpge_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) -#define _mm512_mask_cmpge_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm512_cmpgt_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) -#define _mm512_mask_cmpgt_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm512_cmple_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) -#define _mm512_mask_cmple_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm512_cmplt_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) -#define _mm512_mask_cmplt_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm512_cmpneq_epi16_mask(A, B) \ - _mm512_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) -#define _mm512_mask_cmpneq_epi16_mask(k, A, B) \ - _mm512_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm512_cmpeq_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) -#define _mm512_mask_cmpeq_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm512_cmpge_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) -#define _mm512_mask_cmpge_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm512_cmpgt_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) -#define _mm512_mask_cmpgt_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm512_cmple_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) -#define _mm512_mask_cmple_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm512_cmplt_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) -#define _mm512_mask_cmplt_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm512_cmpneq_epu16_mask(A, B) \ - _mm512_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) -#define _mm512_mask_cmpneq_epu16_mask(k, A, B) \ - _mm512_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi8 (__m512i __A, __m512i __B) { - return (__m512i) ((__v64qu) __A + (__v64qu) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_add_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_add_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_add_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sub_epi8 (__m512i __A, __m512i __B) { - return (__m512i) ((__v64qu) __A - (__v64qu) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_sub_epi8(__A, __B), - (__v64qi)__W); -} - 
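The masked forms that follow all share one convention, sketched below (hypothetical helpers; assumes AVX512BW): the "mask_" variants merge unselected lanes from an explicit source vector, while the "maskz_" variants zero them, exactly as the select builtins in the surrounding code implement.

#include <immintrin.h>

/* Add two vectors of 64 bytes, but only in the lanes selected by `lanes`;
 * unselected lanes keep their value from `base`. */
static __m512i add_bytes_merge(__m512i base, __mmask64 lanes, __m512i a, __m512i b)
{
    return _mm512_mask_add_epi8(base, lanes, a, b);
}

/* Same addition, but unselected lanes are set to zero instead. */
static __m512i add_bytes_zero(__mmask64 lanes, __m512i a, __m512i b)
{
    return _mm512_maskz_add_epi8(lanes, a, b);
}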
-static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_sub_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi16 (__m512i __A, __m512i __B) { - return (__m512i) ((__v32hu) __A + (__v32hu) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_add_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_add_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_add_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sub_epi16 (__m512i __A, __m512i __B) { - return (__m512i) ((__v32hu) __A - (__v32hu) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sub_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sub_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mullo_epi16 (__m512i __A, __m512i __B) { - return (__m512i) ((__v32hu) __A * (__v32hu) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mullo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mullo_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mullo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mullo_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi8 (__mmask64 __U, __m512i __A, __m512i __W) -{ - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __W, - (__v64qi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi16 (__mmask32 __U, __m512i __A, __m512i __W) -{ - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __W, - (__v32hi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi8 (__m512i __A) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pabsb512((__v64qi)__A); -#else - return (__m512i)__builtin_elementwise_abs((__v64qs)__A); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi8 (__m512i __W, __mmask64 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_abs_epi8(__A), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi8 (__mmask64 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_abs_epi8(__A), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi16 (__m512i __A) -{ -#if (__clang_major__ < 14) - return 
(__m512i)__builtin_ia32_pabsw512((__v32hi)__A); -#else - return (__m512i)__builtin_elementwise_abs((__v32hi)__A); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi16 (__m512i __W, __mmask32 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_abs_epi16(__A), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi16 (__mmask32 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_abs_epi16(__A), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi32(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_packssdw512((__v16si)__A, (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_packs_epi32(__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packs_epi32(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packs_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packs_epi32(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packs_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_packsswb512((__v32hi)__A, (__v32hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packs_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packs_epi16(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_packs_epi16(__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packs_epi16(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi32(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_packusdw512((__v16si) __A, (__v16si) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_packus_epi32(__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packus_epi32(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packus_epi32(__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_packus_epi32(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_packus_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_packuswb512((__v32hi) __A, (__v32hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_packus_epi16(__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packus_epi16(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_packus_epi16(__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_packus_epi16(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epi8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return 
(__m512i)__builtin_elementwise_add_sat((__v64qs)__A, (__v64qs)__B); -#else - return (__m512i)__builtin_ia32_paddsb512((__v64qi)__A, (__v64qi)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epi8 (__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epi16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_add_sat((__v32hi)__A, (__v32hi)__B); -#else - return (__m512i)__builtin_ia32_paddsw512((__v32hi)__A, (__v32hi)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epi16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epu8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_add_sat((__v64qu) __A, (__v64qu) __B); -#else - return (__m512i)__builtin_ia32_paddusb512((__v64qi) __A, (__v64qi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epu8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epu8 (__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_adds_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_adds_epu16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_add_sat((__v32hu) __A, (__v32hu) __B); -#else - return (__m512i)__builtin_ia32_paddusw512((__v32hi) __A, (__v32hi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_adds_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_adds_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_adds_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_avg_epu8 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pavgb512((__v64qi)__A, (__v64qi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_avg_epu8 (__m512i __W, __mmask64 __U, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_avg_epu8(__A, __B), - (__v64qi)__W); -} - -static __inline__ 
__m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_avg_epu8 (__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_avg_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_avg_epu16 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pavgw512((__v32hi)__A, (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_avg_epu16 (__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_avg_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_avg_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_avg_epu16(__A, __B), - (__v32hi) _mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epi8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxsb512((__v64qi) __A, (__v64qi) __B); -#else - return (__m512i)__builtin_elementwise_max((__v64qs) __A, (__v64qs) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epi8 (__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epi16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxsw512((__v32hi) __A, (__v32hi) __B); -#else - return (__m512i)__builtin_elementwise_max((__v32hi) __A, (__v32hi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epi16 (__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epu8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxub512((__v64qi)__A, (__v64qi)__B); -#else - return (__m512i)__builtin_elementwise_max((__v64qu)__A, (__v64qu)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epu8 (__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_max_epu8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epu16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxuw512((__v32hi)__A, (__v32hi)__B); -#else - return 
(__m512i)__builtin_elementwise_max((__v32hu)__A, (__v32hu)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epu16 (__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_max_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epi8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminsb512((__v64qi) __A, (__v64qi) __B); -#else - return (__m512i)__builtin_elementwise_min((__v64qs) __A, (__v64qs) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epi8 (__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epi8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epi16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminsw512((__v32hi) __A, (__v32hi) __B); -#else - return (__m512i)__builtin_elementwise_min((__v32hi) __A, (__v32hi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epi16 (__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epi16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epu8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminub512((__v64qi)__A, (__v64qi)__B); -#else - return (__m512i)__builtin_elementwise_min((__v64qu)__A, (__v64qu)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epu8 (__mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epu8 (__m512i __W, __mmask64 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_min_epu8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epu16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminuw512((__v32hi)__A, (__v32hi)__B); -#else - return (__m512i)__builtin_elementwise_min((__v32hu)__A, (__v32hu)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epu16 (__mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static 
__inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epu16 (__m512i __W, __mmask32 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_min_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_shuffle_epi8(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pshufb512((__v64qi)__A,(__v64qi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_shuffle_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_shuffle_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_shuffle_epi8(__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_shuffle_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epi8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_sub_sat((__v64qs)__A, (__v64qs)__B); -#else - return (__m512i)__builtin_ia32_psubsb512((__v64qi)__A, (__v64qi)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_subs_epi8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_subs_epi8 (__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epi16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_sub_sat((__v32hi)__A, (__v32hi)__B); -#else - return (__m512i)__builtin_ia32_psubsw512((__v32hi)__A, (__v32hi)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_subs_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_subs_epi16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epu8 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_sub_sat((__v64qu) __A, (__v64qu) __B); -#else - return (__m512i)__builtin_ia32_psubusb512((__v64qi) __A, (__v64qi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_subs_epu8 (__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epu8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_subs_epu8 (__mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_subs_epu8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_subs_epu16 (__m512i __A, __m512i __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_elementwise_sub_sat((__v32hu) __A, 
(__v32hu) __B); -#else - return (__m512i)__builtin_ia32_psubusw512((__v32hi) __A, (__v32hi) __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_subs_epu16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_subs_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_subs_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, - (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_epi16(__m512i __A, __mmask32 __U, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_epi16(__m512i __A, __m512i __I, __mmask32 __U, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)__I); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_epi16(__mmask32 __U, __m512i __A, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_permutex2var_epi16(__A, __I, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhrs_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pmulhrsw512((__v32hi)__A, (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhrs_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhrs_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mulhrs_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhrs_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhi_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pmulhw512((__v32hi) __A, (__v32hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mulhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mulhi_epu16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_pmulhuw512((__v32hi) __A, (__v32hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mulhi_epu16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 
-_mm512_maskz_mulhi_epu16 (__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_mulhi_epu16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maddubs_epi16(__m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_pmaddubsw512((__v64qi)__X, (__v64qi)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_maddubs_epi16(__m512i __W, __mmask32 __U, __m512i __X, - __m512i __Y) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi)_mm512_maddubs_epi16(__X, __Y), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_maddubs_epi16(__mmask32 __U, __m512i __X, __m512i __Y) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32) __U, - (__v32hi)_mm512_maddubs_epi16(__X, __Y), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_madd_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_pmaddwd512((__v32hi)__A, (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_madd_epi16(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_madd_epi16(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_madd_epi16(__mmask16 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_madd_epi16(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi16_epi8 (__m512i __A) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi)_mm256_setzero_si256(), - (__mmask32) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi)__O, - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovswb512_mask ((__v32hi) __A, - (__v32qi) _mm256_setzero_si256(), - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi16_epi8 (__m512i __A) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi) _mm256_setzero_si256(), - (__mmask32) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi) __O, - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovuswb512_mask ((__v32hi) __A, - (__v32qi) _mm256_setzero_si256(), - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_epi8 (__m512i __A) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) _mm256_undefined_si256(), - (__mmask32) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi16_epi8 (__m256i __O, __mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) __O, - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi16_epi8 (__mmask32 __M, __m512i __A) { - return (__m256i) __builtin_ia32_pmovwb512_mask ((__v32hi) __A, - (__v32qi) 
_mm256_setzero_si256(), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) -{ - __builtin_ia32_pmovwb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) -{ - __builtin_ia32_pmovswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask32 __M, __m512i __A) -{ - __builtin_ia32_pmovuswb512mem_mask ((__v32qi *) __P, (__v32hi) __A, __M); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, - 8, 64+8, 9, 64+9, - 10, 64+10, 11, 64+11, - 12, 64+12, 13, 64+13, - 14, 64+14, 15, 64+15, - 24, 64+24, 25, 64+25, - 26, 64+26, 27, 64+27, - 28, 64+28, 29, 64+29, - 30, 64+30, 31, 64+31, - 40, 64+40, 41, 64+41, - 42, 64+42, 43, 64+43, - 44, 64+44, 45, 64+45, - 46, 64+46, 47, 64+47, - 56, 64+56, 57, 64+57, - 58, 64+58, 59, 64+59, - 60, 64+60, 61, 64+61, - 62, 64+62, 63, 64+63); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpackhi_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpackhi_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, - 4, 32+4, 5, 32+5, - 6, 32+6, 7, 32+7, - 12, 32+12, 13, 32+13, - 14, 32+14, 15, 32+15, - 20, 32+20, 21, 32+21, - 22, 32+22, 23, 32+23, - 28, 32+28, 29, 32+29, - 30, 32+30, 31, 32+31); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpackhi_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpackhi_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi8(__m512i __A, __m512i __B) { - return (__m512i)__builtin_shufflevector((__v64qi)__A, (__v64qi)__B, - 0, 64+0, 1, 64+1, - 2, 64+2, 3, 64+3, - 4, 64+4, 5, 64+5, - 6, 64+6, 7, 64+7, - 16, 64+16, 17, 64+17, - 18, 64+18, 19, 64+19, - 20, 64+20, 21, 64+21, - 22, 64+22, 23, 64+23, - 32, 64+32, 33, 64+33, - 34, 64+34, 35, 64+35, - 36, 64+36, 37, 64+37, - 38, 64+38, 39, 64+39, - 48, 64+48, 49, 64+49, - 50, 64+50, 51, 64+51, - 52, 64+52, 53, 64+53, - 54, 64+54, 55, 64+55); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_epi8(__m512i __W, __mmask64 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpacklo_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_epi8(__mmask64 __U, __m512i __A, __m512i __B) { - return 
(__m512i)__builtin_ia32_selectb_512((__mmask64)__U, - (__v64qi)_mm512_unpacklo_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi16(__m512i __A, __m512i __B) { - return (__m512i)__builtin_shufflevector((__v32hi)__A, (__v32hi)__B, - 0, 32+0, 1, 32+1, - 2, 32+2, 3, 32+3, - 8, 32+8, 9, 32+9, - 10, 32+10, 11, 32+11, - 16, 32+16, 17, 32+17, - 18, 32+18, 19, 32+19, - 24, 32+24, 25, 32+25, - 26, 32+26, 27, 32+27); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpacklo_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_epi16(__mmask32 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_unpacklo_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi8_epi16(__m256i __A) -{ - /* This function always performs a signed extension, but __v32qi is a char - which may be signed or unsigned, so use __v32qs. */ - return (__m512i)__builtin_convertvector((__v32qs)__A, __v32hi); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi8_epi16(__m512i __W, __mmask32 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepi8_epi16(__A), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi8_epi16(__mmask32 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepi8_epi16(__A), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi16(__m256i __A) -{ - return (__m512i)__builtin_convertvector((__v32qu)__A, __v32hi); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu8_epi16(__m512i __W, __mmask32 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepu8_epi16(__A), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu8_epi16(__mmask32 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_cvtepu8_epi16(__A), - (__v32hi)_mm512_setzero_si512()); -} - - -#define _mm512_shufflehi_epi16(A, imm) \ - ((__m512i)__builtin_ia32_pshufhw512((__v32hi)(__m512i)(A), (int)(imm))) - -#define _mm512_mask_shufflehi_epi16(W, U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W))) - -#define _mm512_maskz_shufflehi_epi16(U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflehi_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512())) - -#define _mm512_shufflelo_epi16(A, imm) \ - ((__m512i)__builtin_ia32_pshuflw512((__v32hi)(__m512i)(A), (int)(imm))) - - -#define _mm512_mask_shufflelo_epi16(W, U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)(__m512i)(W))) - - -#define _mm512_maskz_shufflelo_epi16(U, A, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shufflelo_epi16((A), \ - (imm)), \ - (__v32hi)_mm512_setzero_si512())) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sllv_epi16(__m512i __A, __m512i 
__B) -{ - return (__m512i)__builtin_ia32_psllv32hi((__v32hi) __A, (__v32hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sllv_epi16 (__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sllv_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sllv_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sllv_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi16(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psllw512((__v32hi) __A, (__v8hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sll_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sll_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi16(__m512i __A, unsigned int __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, (int)__B); -#else - return (__m512i)__builtin_ia32_psllwi512((__v32hi)__A, __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_slli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_slli_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_slli_epi16(__mmask32 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_slli_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -#define _mm512_bslli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_pslldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srlv_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_psrlv32hi((__v32hi)__A, (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srlv_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srlv_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srlv_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srlv_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srav_epi16(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_psrav32hi((__v32hi)__A, (__v32hi)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srav_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srav_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srav_epi16(__mmask32 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srav_epi16(__A, __B), - 
(__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi16(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psraw512((__v32hi) __A, (__v8hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sra_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_sra_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi16(__m512i __A, unsigned int __B) -{ -#if (__clang_major__ > 14) - return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, (int)__B); -#else - return (__m512i)__builtin_ia32_psrawi512((__v32hi)__A, __B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srai_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srai_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi16(__mmask32 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srai_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi16(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psrlw512((__v32hi) __A, (__v8hi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi16(__m512i __W, __mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srl_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi16(__mmask32 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srl_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi16(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psrlwi512((__v32hi)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srli_epi16(__m512i __W, __mmask32 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srli_epi16(__A, __B), - (__v32hi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srli_epi16(__mmask32 __U, __m512i __A, int __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__U, - (__v32hi)_mm512_srli_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -#define _mm512_bsrli_epi128(a, imm) \ - ((__m512i)__builtin_ia32_psrldqi512_byteshift((__v8di)(__m512i)(a), (int)(imm))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi16 (__m512i __W, __mmask32 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __A, - (__v32hi) __W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi16 (__mmask32 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectw_512 ((__mmask32) __U, - (__v32hi) __A, - (__v32hi) _mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi8 (__m512i __W, 
__mmask64 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __A, - (__v64qi) __W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi8 (__mmask64 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectb_512 ((__mmask64) __U, - (__v64qi) __A, - (__v64qi) _mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi8 (__m512i __O, __mmask64 __M, char __A) -{ - return (__m512i) __builtin_ia32_selectb_512(__M, - (__v64qi)_mm512_set1_epi8(__A), - (__v64qi) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi8 (__mmask64 __M, char __A) -{ - return (__m512i) __builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_set1_epi8(__A), - (__v64qi) _mm512_setzero_si512()); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS -_mm512_kunpackd (__mmask64 __A, __mmask64 __B) -{ - return (__mmask64) __builtin_ia32_kunpckdi ((__mmask64) __A, - (__mmask64) __B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS -_mm512_kunpackw (__mmask32 __A, __mmask32 __B) -{ - return (__mmask32) __builtin_ia32_kunpcksi ((__mmask32) __A, - (__mmask32) __B); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_loadu_epi16 (void const *__P) -{ - struct __loadu_epi16 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi16*)__P)->__v; -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_epi16 (__m512i __W, __mmask32 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P, - (__v32hi) __W, - (__mmask32) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_loadu_epi16 (__mmask32 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddquhi512_mask ((const __v32hi *) __P, - (__v32hi) - _mm512_setzero_si512 (), - (__mmask32) __U); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_loadu_epi8 (void const *__P) -{ - struct __loadu_epi8 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi8*)__P)->__v; -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_epi8 (__m512i __W, __mmask64 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P, - (__v64qi) __W, - (__mmask64) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_loadu_epi8 (__mmask64 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddquqi512_mask ((const __v64qi *) __P, - (__v64qi) - _mm512_setzero_si512 (), - (__mmask64) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_epi16 (void *__P, __m512i __A) -{ - struct __storeu_epi16 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi16*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_epi16 (void *__P, __mmask32 __U, __m512i __A) -{ - __builtin_ia32_storedquhi512_mask ((__v32hi *) __P, - (__v32hi) __A, - (__mmask32) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_epi8 (void *__P, __m512i __A) -{ - struct __storeu_epi8 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi8*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_epi8 (void *__P, __mmask64 __U, __m512i __A) -{ - __builtin_ia32_storedquqi512_mask ((__v64qi *) __P, - (__v64qi) __A, - (__mmask64) __U); -} - -static __inline__ __mmask64 
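/* Usage sketch (illustrative, not part of the upstream header): the masked
 * unaligned byte loads/stores above allow a partial 64-byte copy; assuming
 * clang with -mavx512bw, and with "copy_prefix64" as a made-up helper name:
 *
 * \code
 * #include <immintrin.h>
 *
 * void copy_prefix64(void *dst, const void *src, __mmask64 keep)
 * {
 *   __m512i v = _mm512_maskz_loadu_epi8(keep, src);  // load only the selected bytes, zero the rest
 *   _mm512_mask_storeu_epi8(dst, keep, v);           // store only the selected bytes
 * }
 * \endcode
 */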
__DEFAULT_FN_ATTRS512 -_mm512_test_epi8_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpneq_epi8_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 -_mm512_mask_test_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpneq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_test_epi16_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpneq_epi16_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_mask_test_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpneq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 -_mm512_testn_epi8_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpeq_epi8_mask (_mm512_and_epi32 (__A, __B), _mm512_setzero_si512()); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 -_mm512_mask_testn_epi8_mask (__mmask64 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpeq_epi8_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_testn_epi16_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpeq_epi16_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_mask_testn_epi16_mask (__mmask32 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpeq_epi16_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask64 __DEFAULT_FN_ATTRS512 -_mm512_movepi8_mask (__m512i __A) -{ - return (__mmask64) __builtin_ia32_cvtb2mask512 ((__v64qi) __A); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS512 -_mm512_movepi16_mask (__m512i __A) -{ - return (__mmask32) __builtin_ia32_cvtw2mask512 ((__v32hi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_movm_epi8 (__mmask64 __A) -{ - return (__m512i) __builtin_ia32_cvtmask2b512 (__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_movm_epi16 (__mmask32 __A) -{ - return (__m512i) __builtin_ia32_cvtmask2w512 (__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastb_epi8 (__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v16qi) __A, (__v16qi) __A, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastb_epi8 (__m512i __O, __mmask64 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_broadcastb_epi8(__A), - (__v64qi) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastb_epi8 (__mmask64 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectb_512(__M, - (__v64qi) _mm512_broadcastb_epi8(__A), - (__v64qi) _mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi16 (__m512i __O, __mmask32 __M, short __A) -{ - return (__m512i) __builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_set1_epi16(__A), - (__v32hi) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi16 (__mmask32 __M, short __A) -{ - return (__m512i) __builtin_ia32_selectw_512(__M, - (__v32hi) 
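/* Usage sketch (illustrative, not part of the upstream header): the byte-wise
 * test intrinsics above turn a bitwise AND into a lane mask; assuming clang
 * with -mavx512bw, odd-valued bytes might be flagged like this
 * ("odd_bytes" is a made-up name):
 *
 * \code
 * #include <immintrin.h>
 *
 * __mmask64 odd_bytes(__m512i bytes)
 * {
 *   __m512i one = _mm512_set1_epi8(1);
 *   return _mm512_test_epi8_mask(bytes, one);  // bit set where (byte & 1) != 0
 * }
 * \endcode
 */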
_mm512_set1_epi16(__A), - (__v32hi) _mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastw_epi16 (__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v8hi) __A, (__v8hi) __A, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastw_epi16 (__m512i __O, __mmask32 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_broadcastw_epi16(__A), - (__v32hi) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastw_epi16 (__mmask32 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectw_512(__M, - (__v32hi) _mm512_broadcastw_epi16(__A), - (__v32hi) _mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi16 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi16 (__mmask32 __M, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_permutexvar_epi16(__A, __B), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi16 (__m512i __W, __mmask32 __M, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectw_512((__mmask32)__M, - (__v32hi)_mm512_permutexvar_epi16(__A, __B), - (__v32hi)__W); -} - -#define _mm512_alignr_epi8(A, B, N) \ - ((__m512i)__builtin_ia32_palignr512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(N))) - -#define _mm512_mask_alignr_epi8(W, U, A, B, N) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)(W))) - -#define _mm512_maskz_alignr_epi8(U, A, B, N) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_alignr_epi8((A), (B), (int)(N)), \ - (__v64qi)(__m512i)_mm512_setzero_si512())) - -#define _mm512_dbsad_epu8(A, B, imm) \ - ((__m512i)__builtin_ia32_dbpsadbw512((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), (int)(imm))) - -#define _mm512_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)(__m512i)(W))) - -#define _mm512_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_dbsad_epu8((A), (B), (imm)), \ - (__v32hi)_mm512_setzero_si512())) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sad_epu8 (__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_psadbw512 ((__v64qi) __A, - (__v64qi) __B); -} - -#undef __DEFAULT_FN_ATTRS512 -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512cdintrin.h b/include/avx512cdintrin.h deleted file mode 100644 index bfdba84..0000000 --- a/include/avx512cdintrin.h +++ /dev/null @@ -1,123 +0,0 @@ -/*===------------- avx512cdintrin.h - AVX512CD intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
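/* Usage sketch (illustrative, not part of the upstream headers): the AVX512BW
 * _mm512_sad_epu8 above yields eight 64-bit partial sums of absolute byte
 * differences; assuming clang with -mavx512bw and -mavx512f, they might be
 * collapsed with the AVX512F reduction helper ("sad64" is a made-up name):
 *
 * \code
 * #include <immintrin.h>
 *
 * unsigned long long sad64(__m512i a, __m512i b)
 * {
 *   __m512i partial = _mm512_sad_epu8(a, b);  // 8 x 64-bit partial SADs
 *   return (unsigned long long)_mm512_reduce_add_epi64(partial);
 * }
 * \endcode
 */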
-#endif - -#ifndef __AVX512CDINTRIN_H -#define __AVX512CDINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512cd"), __min_vector_width__(512))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_conflict_epi64 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vpconflictdi_512 ((__v8di) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_conflict_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_conflict_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_conflict_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_conflict_epi64(__A), - (__v8di)_mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_conflict_epi32 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vpconflictsi_512 ((__v16si) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_conflict_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_conflict_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_conflict_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_lzcnt_epi32 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vplzcntd_512 ((__v16si) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_lzcnt_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_lzcnt_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_lzcnt_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_lzcnt_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_lzcnt_epi64 (__m512i __A) -{ - return (__m512i) __builtin_ia32_vplzcntq_512 ((__v8di) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_lzcnt_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_lzcnt_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_lzcnt_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_lzcnt_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m512i) _mm512_set1_epi64((long long) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_broadcastmw_epi32 (__mmask16 __A) -{ - return (__m512i) _mm512_set1_epi32((int) __A); - -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512dqintrin.h b/include/avx512dqintrin.h deleted file mode 100644 index 3ba0a0c..0000000 --- a/include/avx512dqintrin.h +++ /dev/null @@ -1,1377 +0,0 @@ -/*===---- avx512dqintrin.h - AVX512DQ intrinsics ---------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512DQINTRIN_H -#define __AVX512DQINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"), __min_vector_width__(512))) -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512dq"))) - -static __inline __mmask8 __DEFAULT_FN_ATTRS -_knot_mask8(__mmask8 __M) -{ - return __builtin_ia32_knotqi(__M); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kand_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_kandqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kandn_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_kandnqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kor_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_korqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kxnor_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_kxnorqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kxor_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_kxorqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask8_u8(__mmask8 __A, __mmask8 __B) -{ - return (unsigned char)__builtin_ia32_kortestcqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask8_u8(__mmask8 __A, __mmask8 __B) -{ - return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_kortestcqi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask8_u8(__mmask8 __A, __mmask8 __B) -{ - return (unsigned char)__builtin_ia32_ktestcqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask8_u8(__mmask8 __A, __mmask8 __B) -{ - return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktest_mask8_u8(__mmask8 __A, __mmask8 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_ktestcqi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzqi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestc_mask16_u8(__mmask16 __A, __mmask16 __B) -{ - return (unsigned char)__builtin_ia32_ktestchi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktestz_mask16_u8(__mmask16 __A, __mmask16 __B) -{ - return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_ktest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_ktestchi(__A, __B); - return (unsigned char)__builtin_ia32_ktestzhi(__A, __B); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_kadd_mask8(__mmask8 __A, __mmask8 __B) -{ - return (__mmask8)__builtin_ia32_kaddqi((__mmask8)__A, (__mmask8)__B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_kadd_mask16(__mmask16 __A, __mmask16 __B) -{ - 
return (__mmask16)__builtin_ia32_kaddhi((__mmask16)__A, (__mmask16)__B); -} - -#define _kshiftli_mask8(A, I) \ - ((__mmask8)__builtin_ia32_kshiftliqi((__mmask8)(A), (unsigned int)(I))) - -#define _kshiftri_mask8(A, I) \ - ((__mmask8)__builtin_ia32_kshiftriqi((__mmask8)(A), (unsigned int)(I))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_cvtmask8_u32(__mmask8 __A) { - return (unsigned int)__builtin_ia32_kmovb((__mmask8)__A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_cvtu32_mask8(unsigned int __A) { - return (__mmask8)__builtin_ia32_kmovb((__mmask8)__A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS -_load_mask8(__mmask8 *__A) { - return (__mmask8)__builtin_ia32_kmovb(*(__mmask8 *)__A); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_store_mask8(__mmask8 *__A, __mmask8 __B) { - *(__mmask8 *)__A = __builtin_ia32_kmovb((__mmask8)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mullo_epi64 (__m512i __A, __m512i __B) { - return (__m512i) ((__v8du) __A * (__v8du) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mullo_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_mullo_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mullo_epi64(__mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_mullo_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_xor_pd(__m512d __A, __m512d __B) { - return (__m512d)((__v8du)__A ^ (__v8du)__B); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_xor_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_xor_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_xor_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_xor_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_xor_ps (__m512 __A, __m512 __B) { - return (__m512)((__v16su)__A ^ (__v16su)__B); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_xor_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_xor_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_xor_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_xor_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_or_pd(__m512d __A, __m512d __B) { - return (__m512d)((__v8du)__A | (__v8du)__B); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_or_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_or_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_or_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_or_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_or_ps(__m512 __A, __m512 __B) { - return (__m512)((__v16su)__A | 
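/* Usage sketch (illustrative, not part of the upstream header): AVX512DQ adds
 * a native 64-bit low multiply; assuming clang with -mavx512dq, a masked use
 * might look like this ("scale_even" is a made-up name):
 *
 * \code
 * #include <immintrin.h>
 *
 * __m512i scale_even(__m512i v)
 * {
 *   __mmask8 even = 0x55;  // lanes 0, 2, 4, 6
 *   return _mm512_mask_mullo_epi64(v, even, v, _mm512_set1_epi64(3));  // triple the even lanes
 * }
 * \endcode
 */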
(__v16su)__B); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_or_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_or_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_or_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_or_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_and_pd(__m512d __A, __m512d __B) { - return (__m512d)((__v8du)__A & (__v8du)__B); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_and_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_and_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_and_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_and_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_and_ps(__m512 __A, __m512 __B) { - return (__m512)((__v16su)__A & (__v16su)__B); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_and_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_and_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_and_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_and_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_andnot_pd(__m512d __A, __m512d __B) { - return (__m512d)(~(__v8du)__A & (__v8du)__B); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_andnot_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_andnot_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_andnot_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_andnot_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_andnot_ps(__m512 __A, __m512 __B) { - return (__m512)(~(__v16su)__A & (__v16su)__B); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_andnot_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_andnot_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_andnot_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_andnot_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_epi64 (__m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ 
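/* Usage sketch (illustrative, not part of the upstream header): the 512-bit
 * floating-point bitwise operations above enable the usual branch-free
 * absolute-value idiom; assuming clang with -mavx512dq ("abs_pd" is a
 * made-up name):
 *
 * \code
 * #include <immintrin.h>
 *
 * __m512d abs_pd(__m512d x)
 * {
 *   return _mm512_andnot_pd(_mm512_set1_pd(-0.0), x);  // clear each sign bit
 * }
 * \endcode
 */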
__m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_epi64 (__mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2qq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundpd_epi64(A, R) \ - ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundpd_epi64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundpd_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_cvtpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_epu64 (__m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_epu64 (__mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvtpd2uqq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundpd_epu64(A, R) \ - ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundpd_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundpd_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_cvtpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtps_epi64 (__m256 __A) { - return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_epi64 (__mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvtps2qq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundps_epi64(A, R) \ - ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundps_epi64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundps_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtps_epu64 (__m256 __A) { - return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - 
_MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_epu64 (__mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvtps2uqq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundps_epu64(A, R) \ - ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundps_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundps_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_pd (__m512i __A) { - return (__m512d)__builtin_convertvector((__v8di)__A, __v8df); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepi64_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_pd (__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepi64_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_cvt_roundepi64_pd(A, R) \ - ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepi64_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepi64_pd(U, A, R) \ - ((__m512d)__builtin_ia32_cvtqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_ps (__m512i __A) { - return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, - (__v8sf) _mm256_setzero_ps(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_ps (__m256 __W, __mmask8 __U, __m512i __A) { - return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, - (__v8sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_ps (__mmask8 __U, __m512i __A) { - return (__m256) __builtin_ia32_cvtqq2ps512_mask ((__v8di) __A, - (__v8sf) _mm256_setzero_ps(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepi64_ps(A, R) \ - ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepi64_ps(W, U, A, R) \ - ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm512_maskz_cvt_roundepi64_ps(U, A, R) \ - ((__m256)__builtin_ia32_cvtqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 
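/* Usage sketch (illustrative, not part of the upstream header): AVX512DQ
 * provides direct conversions between doubles and 64-bit integers; assuming
 * clang with -mavx512dq, a round trip with explicit rounding might look like
 * this ("round_to_i64" and "back_to_pd" are made-up names):
 *
 * \code
 * #include <immintrin.h>
 *
 * __m512i round_to_i64(__m512d x)
 * {
 *   return _mm512_cvt_roundpd_epi64(x, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
 * }
 *
 * __m512d back_to_pd(__m512i v)
 * {
 *   return _mm512_cvtepi64_pd(v);
 * }
 * \endcode
 */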
-_mm512_cvttpd_epi64 (__m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttpd_epi64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttpd_epi64 (__mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2qq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundpd_epi64(A, R) \ - ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundpd_epi64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundpd_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_cvttpd2qq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttpd_epu64 (__m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttpd_epu64 (__m512i __W, __mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttpd_epu64 (__mmask8 __U, __m512d __A) { - return (__m512i) __builtin_ia32_cvttpd2uqq512_mask ((__v8df) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundpd_epu64(A, R) \ - ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundpd_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundpd_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_cvttpd2uqq512_mask((__v8df)(__m512d)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttps_epi64 (__m256 __A) { - return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttps_epi64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttps_epi64 (__mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvttps2qq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundps_epi64(A, R) \ - ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundps_epi64(W, U, A, R) \ - 
((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundps_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2qq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttps_epu64 (__m256 __A) { - return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttps_epu64 (__m512i __W, __mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, - (__v8di) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttps_epu64 (__mmask8 __U, __m256 __A) { - return (__m512i) __builtin_ia32_cvttps2uqq512_mask ((__v8sf) __A, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundps_epu64(A, R) \ - ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundps_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)(__m512i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundps_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2uqq512_mask((__v8sf)(__m256)(A), \ - (__v8di)_mm512_setzero_si512(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepu64_pd (__m512i __A) { - return (__m512d)__builtin_convertvector((__v8du)__A, __v8df); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu64_pd (__m512d __W, __mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepu64_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu64_pd (__mmask8 __U, __m512i __A) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtepu64_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_cvt_roundepu64_pd(A, R) \ - ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepu64_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_cvt_roundepu64_pd(U, A, R) \ - ((__m512d)__builtin_ia32_cvtuqq2pd512_mask((__v8di)(__m512i)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_cvtepu64_ps (__m512i __A) { - return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, - (__v8sf) _mm256_setzero_ps(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu64_ps (__m256 __W, __mmask8 __U, __m512i __A) { - return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, - (__v8sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu64_ps (__mmask8 __U, __m512i __A) { - return (__m256) __builtin_ia32_cvtuqq2ps512_mask ((__v8di) __A, - (__v8sf) _mm256_setzero_ps(), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepu64_ps(A, R) \ - 
((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepu64_ps(W, U, A, R) \ - ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm512_maskz_cvt_roundepu64_ps(U, A, R) \ - ((__m256)__builtin_ia32_cvtuqq2ps512_mask((__v8di)(__m512i)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_range_pd(A, B, C) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_range_pd(W, U, A, B, C) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_range_pd(U, A, B, C) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_range_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_range_round_pd(W, U, A, B, C, R) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)(__m512d)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm512_maskz_range_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_rangepd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(C), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_range_ps(A, B, C) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_range_ps(W, U, A, B, C) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_range_ps(U, A, B, C) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_range_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_range_round_ps(W, U, A, B, C, R) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)(__m512)(W), (__mmask16)(U), \ - (int)(R))) - -#define _mm512_maskz_range_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_rangeps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(C), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -#define _mm_range_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8) -1, (int)(C),\ - (int)(R))) - -#define _mm_range_ss(A ,B , C) _mm_range_round_ss(A, B, C ,_MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_range_round_ss(W, U, A, B, C, R) \ - ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - 
(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R))) - -#define _mm_mask_range_ss(W , U, A, B, C) _mm_mask_range_round_ss(W, U, A, B, C , _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_range_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_rangess128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C),\ - (int)(R))) - -#define _mm_maskz_range_ss(U, A ,B , C) _mm_maskz_range_round_ss(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) - -#define _mm_range_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8) -1, (int)(C),\ - (int)(R))) - -#define _mm_range_sd(A ,B , C) _mm_range_round_sd(A, B, C ,_MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_range_round_sd(W, U, A, B, C, R) \ - ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W),\ - (__mmask8)(U), (int)(C),\ - (int)(R))) - -#define _mm_mask_range_sd(W, U, A, B, C) _mm_mask_range_round_sd(W, U, A, B, C ,_MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_range_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_rangesd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C),\ - (int)(R))) - -#define _mm_maskz_range_sd(U, A, B, C) _mm_maskz_range_round_sd(U, A, B, C ,_MM_FROUND_CUR_DIRECTION) - -#define _mm512_reduce_pd(A, B) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_reduce_pd(W, U, A, B) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_reduce_pd(U, A, B) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_reduce_ps(A, B) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_reduce_ps(W, U, A, B) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_reduce_ps(U, A, B) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_reduce_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_reduce_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_reduce_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_reducepd512_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_reduce_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_reduce_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), 
(int)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_reduce_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_reduceps512_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -#define _mm_reduce_ss(A, B, C) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_reduce_ss(W, U, A, B, C) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_reduce_ss(U, A, B, C) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_reduce_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), (__mmask8)-1, \ - (int)(C), (int)(R))) - -#define _mm_mask_reduce_round_ss(W, U, A, B, C, R) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(C), (int)(R))) - -#define _mm_maskz_reduce_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_reducess_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(C), (int)(R))) - -#define _mm_reduce_sd(A, B, C) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_reduce_sd(W, U, A, B, C) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_reduce_sd(U, A, B, C) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_reduce_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(C), (int)(R))) - -#define _mm_mask_reduce_round_sd(W, U, A, B, C, R) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), (__mmask8)(U), \ - (int)(C), (int)(R))) - -#define _mm_maskz_reduce_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_reducesd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(C), (int)(R))) - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_movepi32_mask (__m512i __A) -{ - return (__mmask16) __builtin_ia32_cvtd2mask512 ((__v16si) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_movm_epi32 (__mmask16 __A) -{ - return (__m512i) __builtin_ia32_cvtmask2d512 (__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_movm_epi64 (__mmask8 __A) -{ - return (__m512i) __builtin_ia32_cvtmask2q512 (__A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 -_mm512_movepi64_mask (__m512i __A) -{ - return (__mmask8) __builtin_ia32_cvtq2mask512 ((__v8di) __A); -} - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x2 (__m128 __A) -{ - return 
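/* Usage sketch (illustrative, not part of the upstream header): the DQ
 * mask-movement intrinsics above convert between lane masks and full-width
 * vectors; assuming clang with -mavx512dq ("lanes_as_all_ones" and
 * "sign_bits" are made-up names):
 *
 * \code
 * #include <immintrin.h>
 *
 * __m512i lanes_as_all_ones(__mmask16 m)
 * {
 *   return _mm512_movm_epi32(m);     // each selected 32-bit lane becomes all ones
 * }
 *
 * __mmask16 sign_bits(__m512i v)
 * {
 *   return _mm512_movepi32_mask(v);  // collect the sign bit of each 32-bit lane
 * }
 * \endcode
 */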
(__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, - 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x2 (__m512 __O, __mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x2(__A), - (__v16sf)__O); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x2 (__mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x2(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x8(__m256 __A) -{ - return (__m512)__builtin_shufflevector((__v8sf)__A, (__v8sf)__A, - 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x8(__m512 __O, __mmask16 __M, __m256 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x8(__A), - (__v16sf)__O); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x8(__mmask16 __M, __m256 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x8(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f64x2(__m128d __A) -{ - return (__m512d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f64x2(__m512d __O, __mmask8 __M, __m128d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x2(__A), - (__v8df)__O); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f64x2(__mmask8 __M, __m128d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x2(__A), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x2 (__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, - 0, 1, 0, 1, 0, 1, 0, 1, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x2 (__m512i __O, __mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x2(__A), - (__v16si)__O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x2 (__mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x2(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x8(__m256i __A) -{ - return (__m512i)__builtin_shufflevector((__v8si)__A, (__v8si)__A, - 0, 1, 2, 3, 4, 5, 6, 7, - 0, 1, 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x8(__m512i __O, __mmask16 __M, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x8(__A), - (__v16si)__O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x8(__mmask16 __M, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x8(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i64x2(__m128i __A) -{ - return 
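/* Usage sketch (illustrative, not part of the upstream header): the 128-bit
 * sub-vector broadcasts above replicate a pair of doubles across the whole
 * register; assuming clang with -mavx512dq ("splat_pair" is a made-up name):
 *
 * \code
 * #include <immintrin.h>
 *
 * __m512d splat_pair(double lo, double hi)
 * {
 *   __m128d pair = _mm_set_pd(hi, lo);    // element 0 = lo, element 1 = hi
 *   return _mm512_broadcast_f64x2(pair);  // {lo, hi, lo, hi, lo, hi, lo, hi}
 * }
 * \endcode
 */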
(__m512i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i64x2(__m512i __O, __mmask8 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x2(__A), - (__v8di)__O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i64x2(__mmask8 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x2(__A), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_extractf32x8_ps(A, imm) \ - ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_undefined_ps(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf32x8_ps(W, U, A, imm) \ - ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extractf32x8_ps(U, A, imm) \ - ((__m256)__builtin_ia32_extractf32x8_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm512_extractf64x2_pd(A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf64x2_pd(W, U, A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extractf64x2_pd(U, A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_512_mask((__v8df)(__m512d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm512_extracti32x8_epi32(A, imm) \ - ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1)) - -#define _mm512_mask_extracti32x8_epi32(W, U, A, imm) \ - ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extracti32x8_epi32(U, A, imm) \ - ((__m256i)__builtin_ia32_extracti32x8_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U))) - -#define _mm512_extracti64x2_epi64(A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1)) - -#define _mm512_mask_extracti64x2_epi64(W, U, A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extracti64x2_epi64(U, A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_512_mask((__v8di)(__m512i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm512_insertf32x8(A, B, imm) \ - ((__m512)__builtin_ia32_insertf32x8((__v16sf)(__m512)(A), \ - (__v8sf)(__m256)(B), (int)(imm))) - -#define _mm512_mask_insertf32x8(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_insertf32x8(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x8((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) - -#define _mm512_insertf64x2(A, B, imm) \ - ((__m512d)__builtin_ia32_insertf64x2_512((__v8df)(__m512d)(A), \ - (__v2df)(__m128d)(B), (int)(imm))) - -#define _mm512_mask_insertf64x2(W, U, A, B, imm) \ - 
((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_insertf64x2(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_inserti32x8(A, B, imm) \ - ((__m512i)__builtin_ia32_inserti32x8((__v16si)(__m512i)(A), \ - (__v8si)(__m256i)(B), (int)(imm))) - -#define _mm512_mask_inserti32x8(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_inserti32x8(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x8((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_inserti64x2(A, B, imm) \ - ((__m512i)__builtin_ia32_inserti64x2_512((__v8di)(__m512i)(A), \ - (__v2di)(__m128i)(B), (int)(imm))) - -#define _mm512_mask_inserti64x2(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_inserti64x2(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_mask_fpclass_ps_mask(U, A, imm) \ - ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)(U))) - -#define _mm512_fpclass_ps_mask(A, imm) \ - ((__mmask16)__builtin_ia32_fpclassps512_mask((__v16sf)(__m512)(A), \ - (int)(imm), (__mmask16)-1)) - -#define _mm512_mask_fpclass_pd_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm512_fpclass_pd_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd512_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_fpclass_sd_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fpclass_sd_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclasssd_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_fpclass_ss_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fpclass_ss_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclassss_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U))) - -#undef __DEFAULT_FN_ATTRS512 -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512erintrin.h b/include/avx512erintrin.h deleted file mode 100644 index 1c5a2d2..0000000 --- a/include/avx512erintrin.h +++ /dev/null @@ -1,271 +0,0 @@ -/*===---- avx512erintrin.h - AVX512ER intrinsics ---------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
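/* Usage sketch (illustrative, not part of the upstream headers): the fpclass
 * intrinsics above classify values without raising floating-point exceptions;
 * assuming clang with -mavx512dq, infinite lanes might be flagged like this
 * ("infinite_lanes" is a made-up name; the immediate bits follow the VFPCLASS
 * encoding, 0x08 for +inf and 0x10 for -inf):
 *
 * \code
 * #include <immintrin.h>
 *
 * __mmask8 infinite_lanes(__m512d x)
 * {
 *   return _mm512_fpclass_pd_mask(x, 0x08 | 0x10);  // set where the lane is +inf or -inf
 * }
 * \endcode
 */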
-#endif - -#ifndef __AVX512ERINTRIN_H -#define __AVX512ERINTRIN_H - -/* exp2a23 */ -#define _mm512_exp2a23_round_pd(A, R) \ - ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_exp2a23_round_pd(S, M, A, R) \ - ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R))) - -#define _mm512_maskz_exp2a23_round_pd(M, A, R) \ - ((__m512d)__builtin_ia32_exp2pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R))) - -#define _mm512_exp2a23_pd(A) \ - _mm512_exp2a23_round_pd((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_exp2a23_pd(S, M, A) \ - _mm512_mask_exp2a23_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_exp2a23_pd(M, A) \ - _mm512_maskz_exp2a23_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_exp2a23_round_ps(A, R) \ - ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_exp2a23_round_ps(S, M, A, R) \ - ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R))) - -#define _mm512_maskz_exp2a23_round_ps(M, A, R) \ - ((__m512)__builtin_ia32_exp2ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R))) - -#define _mm512_exp2a23_ps(A) \ - _mm512_exp2a23_round_ps((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_exp2a23_ps(S, M, A) \ - _mm512_mask_exp2a23_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_exp2a23_ps(M, A) \ - _mm512_maskz_exp2a23_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) - -/* rsqrt28 */ -#define _mm512_rsqrt28_round_pd(A, R) \ - ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_rsqrt28_round_pd(S, M, A, R) \ - ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R))) - -#define _mm512_maskz_rsqrt28_round_pd(M, A, R) \ - ((__m512d)__builtin_ia32_rsqrt28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R))) - -#define _mm512_rsqrt28_pd(A) \ - _mm512_rsqrt28_round_pd((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rsqrt28_pd(S, M, A) \ - _mm512_mask_rsqrt28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rsqrt28_pd(M, A) \ - _mm512_maskz_rsqrt28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_rsqrt28_round_ps(A, R) \ - ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_rsqrt28_round_ps(S, M, A, R) \ - ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R))) - -#define _mm512_maskz_rsqrt28_round_ps(M, A, R) \ - ((__m512)__builtin_ia32_rsqrt28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R))) - -#define _mm512_rsqrt28_ps(A) \ - _mm512_rsqrt28_round_ps((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rsqrt28_ps(S, M, A) \ - _mm512_mask_rsqrt28_round_ps((S), (M), A, _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rsqrt28_ps(M, A) \ - _mm512_maskz_rsqrt28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm_rsqrt28_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), 
\ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_rsqrt28_round_ss(S, M, A, B, R) \ - ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R))) - -#define _mm_maskz_rsqrt28_round_ss(M, A, B, R) \ - ((__m128)__builtin_ia32_rsqrt28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R))) - -#define _mm_rsqrt28_ss(A, B) \ - _mm_rsqrt28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_rsqrt28_ss(S, M, A, B) \ - _mm_mask_rsqrt28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_rsqrt28_ss(M, A, B) \ - _mm_maskz_rsqrt28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_rsqrt28_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_rsqrt28_round_sd(S, M, A, B, R) \ - ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R))) - -#define _mm_maskz_rsqrt28_round_sd(M, A, B, R) \ - ((__m128d)__builtin_ia32_rsqrt28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R))) - -#define _mm_rsqrt28_sd(A, B) \ - _mm_rsqrt28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_rsqrt28_sd(S, M, A, B) \ - _mm_mask_rsqrt28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_rsqrt28_sd(M, A, B) \ - _mm_maskz_rsqrt28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -/* rcp28 */ -#define _mm512_rcp28_round_pd(A, R) \ - ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_rcp28_round_pd(S, M, A, R) \ - ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(S), (__mmask8)(M), \ - (int)(R))) - -#define _mm512_maskz_rcp28_round_pd(M, A, R) \ - ((__m512d)__builtin_ia32_rcp28pd_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(M), (int)(R))) - -#define _mm512_rcp28_pd(A) \ - _mm512_rcp28_round_pd((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rcp28_pd(S, M, A) \ - _mm512_mask_rcp28_round_pd((S), (M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rcp28_pd(M, A) \ - _mm512_maskz_rcp28_round_pd((M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_rcp28_round_ps(A, R) \ - ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_rcp28_round_ps(S, M, A, R) \ - ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(S), (__mmask16)(M), \ - (int)(R))) - -#define _mm512_maskz_rcp28_round_ps(M, A, R) \ - ((__m512)__builtin_ia32_rcp28ps_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(M), (int)(R))) - -#define _mm512_rcp28_ps(A) \ - _mm512_rcp28_round_ps((A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_rcp28_ps(S, M, A) \ - _mm512_mask_rcp28_round_ps((S), (M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_maskz_rcp28_ps(M, A) \ - _mm512_maskz_rcp28_round_ps((M), (A), _MM_FROUND_CUR_DIRECTION) - -#define _mm_rcp28_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - 
(__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_rcp28_round_ss(S, M, A, B, R) \ - ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(S), \ - (__mmask8)(M), (int)(R))) - -#define _mm_maskz_rcp28_round_ss(M, A, B, R) \ - ((__m128)__builtin_ia32_rcp28ss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(M), (int)(R))) - -#define _mm_rcp28_ss(A, B) \ - _mm_rcp28_round_ss((A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_rcp28_ss(S, M, A, B) \ - _mm_mask_rcp28_round_ss((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_rcp28_ss(M, A, B) \ - _mm_maskz_rcp28_round_ss((M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_rcp28_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_rcp28_round_sd(S, M, A, B, R) \ - ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(S), \ - (__mmask8)(M), (int)(R))) - -#define _mm_maskz_rcp28_round_sd(M, A, B, R) \ - ((__m128d)__builtin_ia32_rcp28sd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(M), (int)(R))) - -#define _mm_rcp28_sd(A, B) \ - _mm_rcp28_round_sd((A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_mask_rcp28_sd(S, M, A, B) \ - _mm_mask_rcp28_round_sd((S), (M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#define _mm_maskz_rcp28_sd(M, A, B) \ - _mm_maskz_rcp28_round_sd((M), (A), (B), _MM_FROUND_CUR_DIRECTION) - -#endif /* __AVX512ERINTRIN_H */ diff --git a/include/avx512fintrin.h b/include/avx512fintrin.h deleted file mode 100644 index cd1dc82..0000000 --- a/include/avx512fintrin.h +++ /dev/null @@ -1,9930 +0,0 @@ -/*===---- avx512fintrin.h - AVX512F intrinsics -----------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512FINTRIN_H -#define __AVX512FINTRIN_H - -typedef char __v64qi __attribute__((__vector_size__(64))); -typedef short __v32hi __attribute__((__vector_size__(64))); -typedef double __v8df __attribute__((__vector_size__(64))); -typedef float __v16sf __attribute__((__vector_size__(64))); -typedef long long __v8di __attribute__((__vector_size__(64))); -typedef int __v16si __attribute__((__vector_size__(64))); - -/* Unsigned types */ -typedef unsigned char __v64qu __attribute__((__vector_size__(64))); -typedef unsigned short __v32hu __attribute__((__vector_size__(64))); -typedef unsigned long long __v8du __attribute__((__vector_size__(64))); -typedef unsigned int __v16su __attribute__((__vector_size__(64))); - -/* We need an explicitly signed variant for char. Note that this shouldn't - * appear in the interface though. 
*/ -typedef signed char __v64qs __attribute__((__vector_size__(64))); - -typedef float __m512 __attribute__((__vector_size__(64), __aligned__(64))); -typedef double __m512d __attribute__((__vector_size__(64), __aligned__(64))); -typedef long long __m512i __attribute__((__vector_size__(64), __aligned__(64))); - -typedef float __m512_u __attribute__((__vector_size__(64), __aligned__(1))); -typedef double __m512d_u __attribute__((__vector_size__(64), __aligned__(1))); -typedef long long __m512i_u __attribute__((__vector_size__(64), __aligned__(1))); - -typedef unsigned char __mmask8; -typedef unsigned short __mmask16; - -/* Rounding mode macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 - -/* Constants for integer comparison predicates */ -typedef enum { - _MM_CMPINT_EQ, /* Equal */ - _MM_CMPINT_LT, /* Less than */ - _MM_CMPINT_LE, /* Less than or Equal */ - _MM_CMPINT_UNUSED, - _MM_CMPINT_NE, /* Not Equal */ - _MM_CMPINT_NLT, /* Not Less than */ -#define _MM_CMPINT_GE _MM_CMPINT_NLT /* Greater than or Equal */ - _MM_CMPINT_NLE /* Not Less than or Equal */ -#define _MM_CMPINT_GT _MM_CMPINT_NLE /* Greater than */ -} _MM_CMPINT_ENUM; - -typedef enum -{ - _MM_PERM_AAAA = 0x00, _MM_PERM_AAAB = 0x01, _MM_PERM_AAAC = 0x02, - _MM_PERM_AAAD = 0x03, _MM_PERM_AABA = 0x04, _MM_PERM_AABB = 0x05, - _MM_PERM_AABC = 0x06, _MM_PERM_AABD = 0x07, _MM_PERM_AACA = 0x08, - _MM_PERM_AACB = 0x09, _MM_PERM_AACC = 0x0A, _MM_PERM_AACD = 0x0B, - _MM_PERM_AADA = 0x0C, _MM_PERM_AADB = 0x0D, _MM_PERM_AADC = 0x0E, - _MM_PERM_AADD = 0x0F, _MM_PERM_ABAA = 0x10, _MM_PERM_ABAB = 0x11, - _MM_PERM_ABAC = 0x12, _MM_PERM_ABAD = 0x13, _MM_PERM_ABBA = 0x14, - _MM_PERM_ABBB = 0x15, _MM_PERM_ABBC = 0x16, _MM_PERM_ABBD = 0x17, - _MM_PERM_ABCA = 0x18, _MM_PERM_ABCB = 0x19, _MM_PERM_ABCC = 0x1A, - _MM_PERM_ABCD = 0x1B, _MM_PERM_ABDA = 0x1C, _MM_PERM_ABDB = 0x1D, - _MM_PERM_ABDC = 0x1E, _MM_PERM_ABDD = 0x1F, _MM_PERM_ACAA = 0x20, - _MM_PERM_ACAB = 0x21, _MM_PERM_ACAC = 0x22, _MM_PERM_ACAD = 0x23, - _MM_PERM_ACBA = 0x24, _MM_PERM_ACBB = 0x25, _MM_PERM_ACBC = 0x26, - _MM_PERM_ACBD = 0x27, _MM_PERM_ACCA = 0x28, _MM_PERM_ACCB = 0x29, - _MM_PERM_ACCC = 0x2A, _MM_PERM_ACCD = 0x2B, _MM_PERM_ACDA = 0x2C, - _MM_PERM_ACDB = 0x2D, _MM_PERM_ACDC = 0x2E, _MM_PERM_ACDD = 0x2F, - _MM_PERM_ADAA = 0x30, _MM_PERM_ADAB = 0x31, _MM_PERM_ADAC = 0x32, - _MM_PERM_ADAD = 0x33, _MM_PERM_ADBA = 0x34, _MM_PERM_ADBB = 0x35, - _MM_PERM_ADBC = 0x36, _MM_PERM_ADBD = 0x37, _MM_PERM_ADCA = 0x38, - _MM_PERM_ADCB = 0x39, _MM_PERM_ADCC = 0x3A, _MM_PERM_ADCD = 0x3B, - _MM_PERM_ADDA = 0x3C, _MM_PERM_ADDB = 0x3D, _MM_PERM_ADDC = 0x3E, - _MM_PERM_ADDD = 0x3F, _MM_PERM_BAAA = 0x40, _MM_PERM_BAAB = 0x41, - _MM_PERM_BAAC = 0x42, _MM_PERM_BAAD = 0x43, _MM_PERM_BABA = 0x44, - _MM_PERM_BABB = 0x45, _MM_PERM_BABC = 0x46, _MM_PERM_BABD = 0x47, - _MM_PERM_BACA = 0x48, _MM_PERM_BACB = 0x49, _MM_PERM_BACC = 0x4A, - _MM_PERM_BACD = 0x4B, _MM_PERM_BADA = 0x4C, _MM_PERM_BADB = 0x4D, - _MM_PERM_BADC = 0x4E, _MM_PERM_BADD = 0x4F, _MM_PERM_BBAA = 0x50, - _MM_PERM_BBAB = 0x51, _MM_PERM_BBAC = 0x52, _MM_PERM_BBAD = 0x53, - _MM_PERM_BBBA = 0x54, _MM_PERM_BBBB = 0x55, _MM_PERM_BBBC = 0x56, - _MM_PERM_BBBD = 0x57, _MM_PERM_BBCA = 0x58, _MM_PERM_BBCB = 0x59, - _MM_PERM_BBCC = 0x5A, _MM_PERM_BBCD = 0x5B, _MM_PERM_BBDA = 0x5C, - _MM_PERM_BBDB = 0x5D, _MM_PERM_BBDC = 0x5E, _MM_PERM_BBDD = 0x5F, - _MM_PERM_BCAA = 0x60, _MM_PERM_BCAB = 0x61, _MM_PERM_BCAC = 0x62, 
- _MM_PERM_BCAD = 0x63, _MM_PERM_BCBA = 0x64, _MM_PERM_BCBB = 0x65, - _MM_PERM_BCBC = 0x66, _MM_PERM_BCBD = 0x67, _MM_PERM_BCCA = 0x68, - _MM_PERM_BCCB = 0x69, _MM_PERM_BCCC = 0x6A, _MM_PERM_BCCD = 0x6B, - _MM_PERM_BCDA = 0x6C, _MM_PERM_BCDB = 0x6D, _MM_PERM_BCDC = 0x6E, - _MM_PERM_BCDD = 0x6F, _MM_PERM_BDAA = 0x70, _MM_PERM_BDAB = 0x71, - _MM_PERM_BDAC = 0x72, _MM_PERM_BDAD = 0x73, _MM_PERM_BDBA = 0x74, - _MM_PERM_BDBB = 0x75, _MM_PERM_BDBC = 0x76, _MM_PERM_BDBD = 0x77, - _MM_PERM_BDCA = 0x78, _MM_PERM_BDCB = 0x79, _MM_PERM_BDCC = 0x7A, - _MM_PERM_BDCD = 0x7B, _MM_PERM_BDDA = 0x7C, _MM_PERM_BDDB = 0x7D, - _MM_PERM_BDDC = 0x7E, _MM_PERM_BDDD = 0x7F, _MM_PERM_CAAA = 0x80, - _MM_PERM_CAAB = 0x81, _MM_PERM_CAAC = 0x82, _MM_PERM_CAAD = 0x83, - _MM_PERM_CABA = 0x84, _MM_PERM_CABB = 0x85, _MM_PERM_CABC = 0x86, - _MM_PERM_CABD = 0x87, _MM_PERM_CACA = 0x88, _MM_PERM_CACB = 0x89, - _MM_PERM_CACC = 0x8A, _MM_PERM_CACD = 0x8B, _MM_PERM_CADA = 0x8C, - _MM_PERM_CADB = 0x8D, _MM_PERM_CADC = 0x8E, _MM_PERM_CADD = 0x8F, - _MM_PERM_CBAA = 0x90, _MM_PERM_CBAB = 0x91, _MM_PERM_CBAC = 0x92, - _MM_PERM_CBAD = 0x93, _MM_PERM_CBBA = 0x94, _MM_PERM_CBBB = 0x95, - _MM_PERM_CBBC = 0x96, _MM_PERM_CBBD = 0x97, _MM_PERM_CBCA = 0x98, - _MM_PERM_CBCB = 0x99, _MM_PERM_CBCC = 0x9A, _MM_PERM_CBCD = 0x9B, - _MM_PERM_CBDA = 0x9C, _MM_PERM_CBDB = 0x9D, _MM_PERM_CBDC = 0x9E, - _MM_PERM_CBDD = 0x9F, _MM_PERM_CCAA = 0xA0, _MM_PERM_CCAB = 0xA1, - _MM_PERM_CCAC = 0xA2, _MM_PERM_CCAD = 0xA3, _MM_PERM_CCBA = 0xA4, - _MM_PERM_CCBB = 0xA5, _MM_PERM_CCBC = 0xA6, _MM_PERM_CCBD = 0xA7, - _MM_PERM_CCCA = 0xA8, _MM_PERM_CCCB = 0xA9, _MM_PERM_CCCC = 0xAA, - _MM_PERM_CCCD = 0xAB, _MM_PERM_CCDA = 0xAC, _MM_PERM_CCDB = 0xAD, - _MM_PERM_CCDC = 0xAE, _MM_PERM_CCDD = 0xAF, _MM_PERM_CDAA = 0xB0, - _MM_PERM_CDAB = 0xB1, _MM_PERM_CDAC = 0xB2, _MM_PERM_CDAD = 0xB3, - _MM_PERM_CDBA = 0xB4, _MM_PERM_CDBB = 0xB5, _MM_PERM_CDBC = 0xB6, - _MM_PERM_CDBD = 0xB7, _MM_PERM_CDCA = 0xB8, _MM_PERM_CDCB = 0xB9, - _MM_PERM_CDCC = 0xBA, _MM_PERM_CDCD = 0xBB, _MM_PERM_CDDA = 0xBC, - _MM_PERM_CDDB = 0xBD, _MM_PERM_CDDC = 0xBE, _MM_PERM_CDDD = 0xBF, - _MM_PERM_DAAA = 0xC0, _MM_PERM_DAAB = 0xC1, _MM_PERM_DAAC = 0xC2, - _MM_PERM_DAAD = 0xC3, _MM_PERM_DABA = 0xC4, _MM_PERM_DABB = 0xC5, - _MM_PERM_DABC = 0xC6, _MM_PERM_DABD = 0xC7, _MM_PERM_DACA = 0xC8, - _MM_PERM_DACB = 0xC9, _MM_PERM_DACC = 0xCA, _MM_PERM_DACD = 0xCB, - _MM_PERM_DADA = 0xCC, _MM_PERM_DADB = 0xCD, _MM_PERM_DADC = 0xCE, - _MM_PERM_DADD = 0xCF, _MM_PERM_DBAA = 0xD0, _MM_PERM_DBAB = 0xD1, - _MM_PERM_DBAC = 0xD2, _MM_PERM_DBAD = 0xD3, _MM_PERM_DBBA = 0xD4, - _MM_PERM_DBBB = 0xD5, _MM_PERM_DBBC = 0xD6, _MM_PERM_DBBD = 0xD7, - _MM_PERM_DBCA = 0xD8, _MM_PERM_DBCB = 0xD9, _MM_PERM_DBCC = 0xDA, - _MM_PERM_DBCD = 0xDB, _MM_PERM_DBDA = 0xDC, _MM_PERM_DBDB = 0xDD, - _MM_PERM_DBDC = 0xDE, _MM_PERM_DBDD = 0xDF, _MM_PERM_DCAA = 0xE0, - _MM_PERM_DCAB = 0xE1, _MM_PERM_DCAC = 0xE2, _MM_PERM_DCAD = 0xE3, - _MM_PERM_DCBA = 0xE4, _MM_PERM_DCBB = 0xE5, _MM_PERM_DCBC = 0xE6, - _MM_PERM_DCBD = 0xE7, _MM_PERM_DCCA = 0xE8, _MM_PERM_DCCB = 0xE9, - _MM_PERM_DCCC = 0xEA, _MM_PERM_DCCD = 0xEB, _MM_PERM_DCDA = 0xEC, - _MM_PERM_DCDB = 0xED, _MM_PERM_DCDC = 0xEE, _MM_PERM_DCDD = 0xEF, - _MM_PERM_DDAA = 0xF0, _MM_PERM_DDAB = 0xF1, _MM_PERM_DDAC = 0xF2, - _MM_PERM_DDAD = 0xF3, _MM_PERM_DDBA = 0xF4, _MM_PERM_DDBB = 0xF5, - _MM_PERM_DDBC = 0xF6, _MM_PERM_DDBD = 0xF7, _MM_PERM_DDCA = 0xF8, - _MM_PERM_DDCB = 0xF9, _MM_PERM_DDCC = 0xFA, _MM_PERM_DDCD = 0xFB, - _MM_PERM_DDDA = 0xFC, _MM_PERM_DDDB = 0xFD, _MM_PERM_DDDC = 0xFE, - _MM_PERM_DDDD = 
0xFF -} _MM_PERM_ENUM; - -typedef enum -{ - _MM_MANT_NORM_1_2, /* interval [1, 2) */ - _MM_MANT_NORM_p5_2, /* interval [0.5, 2) */ - _MM_MANT_NORM_p5_1, /* interval [0.5, 1) */ - _MM_MANT_NORM_p75_1p5 /* interval [0.75, 1.5) */ -} _MM_MANTISSA_NORM_ENUM; - -typedef enum -{ - _MM_MANT_SIGN_src, /* sign = sign(SRC) */ - _MM_MANT_SIGN_zero, /* sign = 0 */ - _MM_MANT_SIGN_nan /* DEST = NaN if sign(SRC) = 1 */ -} _MM_MANTISSA_SIGN_ENUM; - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS512 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(512))) -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512f"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512f"))) - -/* Create vectors with repeated elements */ - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_setzero_si512(void) -{ - return __extension__ (__m512i)(__v8di){ 0, 0, 0, 0, 0, 0, 0, 0 }; -} - -#define _mm512_setzero_epi32 _mm512_setzero_si512 - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_undefined_pd(void) -{ - return (__m512d)__builtin_ia32_undef512(); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_undefined(void) -{ - return (__m512)__builtin_ia32_undef512(); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_undefined_ps(void) -{ - return (__m512)__builtin_ia32_undef512(); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_undefined_epi32(void) -{ - return (__m512i)__builtin_ia32_undef512(); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastd_epi32 (__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v4si) __A, (__v4si) __A, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastd_epi32 (__m512i __O, __mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si) _mm512_broadcastd_epi32(__A), - (__v16si) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastd_epi32 (__mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si) _mm512_broadcastd_epi32(__A), - (__v16si) _mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcastq_epi64 (__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v2di) __A, (__v2di) __A, - 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastq_epi64 (__m512i __O, __mmask8 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) __O); - -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di) _mm512_broadcastq_epi64(__A), - (__v8di) _mm512_setzero_si512()); -} - - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_setzero_ps(void) -{ - return __extension__ (__m512){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; -} - -#define _mm512_setzero _mm512_setzero_ps - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_setzero_pd(void) -{ - return __extension__ (__m512d){ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 }; -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_set1_ps(float __w) -{ - return __extension__ (__m512){ __w, __w, __w, __w, __w, __w, __w, __w, - 
__w, __w, __w, __w, __w, __w, __w, __w }; -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_set1_pd(double __w) -{ - return __extension__ (__m512d){ __w, __w, __w, __w, __w, __w, __w, __w }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set1_epi8(char __w) -{ - return __extension__ (__m512i)(__v64qi){ - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set1_epi16(short __w) -{ - return __extension__ (__m512i)(__v32hi){ - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set1_epi32(int __s) -{ - return __extension__ (__m512i)(__v16si){ - __s, __s, __s, __s, __s, __s, __s, __s, - __s, __s, __s, __s, __s, __s, __s, __s }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi32(__mmask16 __M, int __A) -{ - return (__m512i)__builtin_ia32_selectd_512(__M, - (__v16si)_mm512_set1_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set1_epi64(long long __d) -{ - return __extension__(__m512i)(__v8di){ __d, __d, __d, __d, __d, __d, __d, __d }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_set1_epi64(__mmask8 __M, long long __A) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_set1_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcastss_ps(__m128 __A) -{ - return (__m512)__builtin_shufflevector((__v4sf) __A, (__v4sf) __A, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set4_epi32 (int __A, int __B, int __C, int __D) -{ - return __extension__ (__m512i)(__v16si) - { __D, __C, __B, __A, __D, __C, __B, __A, - __D, __C, __B, __A, __D, __C, __B, __A }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set4_epi64 (long long __A, long long __B, long long __C, - long long __D) -{ - return __extension__ (__m512i) (__v8di) - { __D, __C, __B, __A, __D, __C, __B, __A }; -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_set4_pd (double __A, double __B, double __C, double __D) -{ - return __extension__ (__m512d) - { __D, __C, __B, __A, __D, __C, __B, __A }; -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_set4_ps (float __A, float __B, float __C, float __D) -{ - return __extension__ (__m512) - { __D, __C, __B, __A, __D, __C, __B, __A, - __D, __C, __B, __A, __D, __C, __B, __A }; -} - -#define _mm512_setr4_epi32(e0,e1,e2,e3) \ - _mm512_set4_epi32((e3),(e2),(e1),(e0)) - -#define _mm512_setr4_epi64(e0,e1,e2,e3) \ - _mm512_set4_epi64((e3),(e2),(e1),(e0)) - -#define _mm512_setr4_pd(e0,e1,e2,e3) \ - _mm512_set4_pd((e3),(e2),(e1),(e0)) - -#define _mm512_setr4_ps(e0,e1,e2,e3) \ - _mm512_set4_ps((e3),(e2),(e1),(e0)) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcastsd_pd(__m128d __A) -{ - return (__m512d)__builtin_shufflevector((__v2df) __A, (__v2df) __A, - 0, 0, 0, 0, 0, 0, 0, 0); -} - -/* Cast between vector types */ - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_castpd256_pd512(__m256d 
__a) -{ - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, -1, -1, -1, -1); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_castps256_ps512(__m256 __a) -{ - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, - -1, -1, -1, -1, -1, -1, -1, -1); -} - -static __inline __m128d __DEFAULT_FN_ATTRS512 -_mm512_castpd512_pd128(__m512d __a) -{ - return __builtin_shufflevector(__a, __a, 0, 1); -} - -static __inline __m256d __DEFAULT_FN_ATTRS512 -_mm512_castpd512_pd256 (__m512d __A) -{ - return __builtin_shufflevector(__A, __A, 0, 1, 2, 3); -} - -static __inline __m128 __DEFAULT_FN_ATTRS512 -_mm512_castps512_ps128(__m512 __a) -{ - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3); -} - -static __inline __m256 __DEFAULT_FN_ATTRS512 -_mm512_castps512_ps256 (__m512 __A) -{ - return __builtin_shufflevector(__A, __A, 0, 1, 2, 3, 4, 5, 6, 7); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_castpd_ps (__m512d __A) -{ - return (__m512) (__A); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_castpd_si512 (__m512d __A) -{ - return (__m512i) (__A); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_castpd128_pd512 (__m128d __A) -{ - return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_castps_pd (__m512 __A) -{ - return (__m512d) (__A); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_castps_si512 (__m512 __A) -{ - return (__m512i) (__A); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_castps128_ps512 (__m128 __A) -{ - return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_castsi128_si512 (__m128i __A) -{ - return __builtin_shufflevector( __A, __A, 0, 1, -1, -1, -1, -1, -1, -1); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_castsi256_si512 (__m256i __A) -{ - return __builtin_shufflevector( __A, __A, 0, 1, 2, 3, -1, -1, -1, -1); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_castsi512_ps (__m512i __A) -{ - return (__m512) (__A); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_castsi512_pd (__m512i __A) -{ - return (__m512d) (__A); -} - -static __inline __m128i __DEFAULT_FN_ATTRS512 -_mm512_castsi512_si128 (__m512i __A) -{ - return (__m128i)__builtin_shufflevector(__A, __A , 0, 1); -} - -static __inline __m256i __DEFAULT_FN_ATTRS512 -_mm512_castsi512_si256 (__m512i __A) -{ - return (__m256i)__builtin_shufflevector(__A, __A , 0, 1, 2, 3); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_int2mask(int __a) -{ - return (__mmask16)__a; -} - -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_mask2int(__mmask16 __a) -{ - return (int)__a; -} - -/// Constructs a 512-bit floating-point vector of [8 x double] from a -/// 128-bit floating-point vector of [2 x double]. The lower 128 bits -/// contain the value of the source vector. The upper 384 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 512-bit floating-point vector of [8 x double]. The lower 128 bits -/// contain the value of the parameter. The upper 384 bits are set to zero. 
-static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_zextpd128_pd512(__m128d __a) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3, 2, 3, 2, 3); -} - -/// Constructs a 512-bit floating-point vector of [8 x double] from a -/// 256-bit floating-point vector of [4 x double]. The lower 256 bits -/// contain the value of the source vector. The upper 256 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 512-bit floating-point vector of [8 x double]. The lower 256 bits -/// contain the value of the parameter. The upper 256 bits are set to zero. -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_zextpd256_pd512(__m256d __a) -{ - return __builtin_shufflevector((__v4df)__a, (__v4df)_mm256_setzero_pd(), 0, 1, 2, 3, 4, 5, 6, 7); -} - -/// Constructs a 512-bit floating-point vector of [16 x float] from a -/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain -/// the value of the source vector. The upper 384 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 512-bit floating-point vector of [16 x float]. The lower 128 bits -/// contain the value of the parameter. The upper 384 bits are set to zero. -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_zextps128_ps512(__m128 __a) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 4, 5, 6, 7, 4, 5, 6, 7); -} - -/// Constructs a 512-bit floating-point vector of [16 x float] from a -/// 256-bit floating-point vector of [8 x float]. The lower 256 bits contain -/// the value of the source vector. The upper 256 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 512-bit floating-point vector of [16 x float]. The lower 256 bits -/// contain the value of the parameter. The upper 256 bits are set to zero. -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_zextps256_ps512(__m256 __a) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)_mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -} - -/// Constructs a 512-bit integer vector from a 128-bit integer vector. -/// The lower 128 bits contain the value of the source vector. The upper -/// 384 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 512-bit integer vector. The lower 128 bits contain the value of -/// the parameter. The upper 384 bits are set to zero. -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_zextsi128_si512(__m128i __a) -{ - return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3, 2, 3, 2, 3); -} - -/// Constructs a 512-bit integer vector from a 256-bit integer vector. -/// The lower 256 bits contain the value of the source vector. The upper -/// 256 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \returns A 512-bit integer vector. The lower 256 bits contain the value of -/// the parameter. The upper 256 bits are set to zero. 
-static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_zextsi256_si512(__m256i __a) -{ - return __builtin_shufflevector((__v4di)__a, (__v4di)_mm256_setzero_si256(), 0, 1, 2, 3, 4, 5, 6, 7); -} - -/* Bitwise operators */ -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_and_epi32(__m512i __a, __m512i __b) -{ - return (__m512i)((__v16su)__a & (__v16su)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_and_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si) _mm512_and_epi32(__a, __b), - (__v16si) __src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_and_epi32(__mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i) _mm512_mask_and_epi32(_mm512_setzero_si512 (), - __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_and_epi64(__m512i __a, __m512i __b) -{ - return (__m512i)((__v8du)__a & (__v8du)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_and_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __k, - (__v8di) _mm512_and_epi64(__a, __b), - (__v8di) __src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_and_epi64(__mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i) _mm512_mask_and_epi64(_mm512_setzero_si512 (), - __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_andnot_si512 (__m512i __A, __m512i __B) -{ - return (__m512i)(~(__v8du)__A & (__v8du)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_andnot_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i)(~(__v16su)__A & (__v16su)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_andnot_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_andnot_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_andnot_epi32(__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)_mm512_mask_andnot_epi32(_mm512_setzero_si512(), - __U, __A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_andnot_epi64(__m512i __A, __m512i __B) -{ - return (__m512i)(~(__v8du)__A & (__v8du)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_andnot_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_andnot_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_andnot_epi64(__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)_mm512_mask_andnot_epi64(_mm512_setzero_si512(), - __U, __A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_or_epi32(__m512i __a, __m512i __b) -{ - return (__m512i)((__v16su)__a | (__v16su)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_or_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si)_mm512_or_epi32(__a, __b), - (__v16si)__src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_or_epi32(__mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i)_mm512_mask_or_epi32(_mm512_setzero_si512(), __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_or_epi64(__m512i __a, __m512i __b) -{ - return 
(__m512i)((__v8du)__a | (__v8du)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_or_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, - (__v8di)_mm512_or_epi64(__a, __b), - (__v8di)__src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_or_epi64(__mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i)_mm512_mask_or_epi64(_mm512_setzero_si512(), __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_xor_epi32(__m512i __a, __m512i __b) -{ - return (__m512i)((__v16su)__a ^ (__v16su)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_xor_epi32(__m512i __src, __mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__k, - (__v16si)_mm512_xor_epi32(__a, __b), - (__v16si)__src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_xor_epi32(__mmask16 __k, __m512i __a, __m512i __b) -{ - return (__m512i)_mm512_mask_xor_epi32(_mm512_setzero_si512(), __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_xor_epi64(__m512i __a, __m512i __b) -{ - return (__m512i)((__v8du)__a ^ (__v8du)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_xor_epi64(__m512i __src, __mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__k, - (__v8di)_mm512_xor_epi64(__a, __b), - (__v8di)__src); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_xor_epi64(__mmask8 __k, __m512i __a, __m512i __b) -{ - return (__m512i)_mm512_mask_xor_epi64(_mm512_setzero_si512(), __k, __a, __b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_and_si512(__m512i __a, __m512i __b) -{ - return (__m512i)((__v8du)__a & (__v8du)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_or_si512(__m512i __a, __m512i __b) -{ - return (__m512i)((__v8du)__a | (__v8du)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_xor_si512(__m512i __a, __m512i __b) -{ - return (__m512i)((__v8du)__a ^ (__v8du)__b); -} - -/* Arithmetic */ - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_add_pd(__m512d __a, __m512d __b) -{ - return (__m512d)((__v8df)__a + (__v8df)__b); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_add_ps(__m512 __a, __m512 __b) -{ - return (__m512)((__v16sf)__a + (__v16sf)__b); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mul_pd(__m512d __a, __m512d __b) -{ - return (__m512d)((__v8df)__a * (__v8df)__b); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mul_ps(__m512 __a, __m512 __b) -{ - return (__m512)((__v16sf)__a * (__v16sf)__b); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_sub_pd(__m512d __a, __m512d __b) -{ - return (__m512d)((__v8df)__a - (__v8df)__b); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_sub_ps(__m512 __a, __m512 __b) -{ - return (__m512)((__v16sf)__a - (__v16sf)__b); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi64 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v8du) __A + (__v8du) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_add_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_add_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_epi64(__mmask8 __U, __m512i __A, __m512i __B) -{ - return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_add_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sub_epi64 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v8du) __A - (__v8du) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sub_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_epi64(__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sub_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_add_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v16su) __A + (__v16su) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_add_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_add_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_epi32 (__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_add_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sub_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v16su) __A - (__v16su) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sub_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_epi32(__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sub_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -#define _mm512_max_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_maxpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_max_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)(W))) - -#define _mm512_maskz_max_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_max_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_max_pd(__m512d __A, __m512d __B) -{ - return (__m512d) __builtin_ia32_maxpd512((__v8df) __A, (__v8df) __B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_max_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_max_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_pd (__mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_max_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_max_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_maxps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_max_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - 
(__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)(W))) - -#define _mm512_maskz_max_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_max_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_max_ps(__m512 __A, __m512 __B) -{ - return (__m512) __builtin_ia32_maxps512((__v16sf) __A, (__v16sf) __B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_max_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_max_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_ps (__mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_max_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_max_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_max_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_maxss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_max_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_max_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_max_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_maxss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_max_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_max_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_maxsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_max_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_max_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_max_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_maxsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline __m512i -__DEFAULT_FN_ATTRS512 -_mm512_max_epi32(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxsd512((__v16si)__A, (__v16si)__B); -#else - return (__m512i)__builtin_elementwise_max((__v16si)__A, (__v16si)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epi32 (__m512i __W, __mmask16 __M, 
__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epi32 (__mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epu32(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxud512((__v16si)__A, (__v16si)__B); -#else - return (__m512i)__builtin_elementwise_max((__v16su)__A, (__v16su)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epu32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epu32 (__mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_max_epu32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epi64(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxsq512((__v8di)__A, (__v8di)__B); -#else - return (__m512i)__builtin_elementwise_max((__v8di)__A, (__v8di)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epi64 (__mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_max_epu64(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pmaxuq512((__v8di)__A, (__v8di)__B); -#else - return (__m512i)__builtin_elementwise_max((__v8du)__A, (__v8du)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_max_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epu64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_epu64 (__mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_max_epu64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_min_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_minpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_min_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)(W))) - -#define _mm512_maskz_min_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_min_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_min_pd(__m512d __A, __m512d __B) -{ - return (__m512d) __builtin_ia32_minpd512((__v8df) __A, (__v8df) __B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_min_pd (__m512d 
__W, __mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_min_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_pd (__mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_min_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_min_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_minps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_min_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)(W))) - -#define _mm512_maskz_min_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_min_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_min_ps(__m512 __A, __m512 __B) -{ - return (__m512) __builtin_ia32_minps512((__v16sf) __A, (__v16sf) __B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_min_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_min_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_ps (__mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_min_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_min_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_min_ss(__mmask8 __U,__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_minss_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_min_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_min_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_min_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_minss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_min_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_min_sd(__mmask8 __U,__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_minsd_round_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_min_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_min_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - 
(__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_min_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_minsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline __m512i -__DEFAULT_FN_ATTRS512 -_mm512_min_epi32(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminsd512((__v16si)__A, (__v16si)__B); -#else - return (__m512i)__builtin_elementwise_min((__v16si)__A, (__v16si)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epi32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_min_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epi32 (__mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_min_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epu32(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminud512((__v16si)__A, (__v16si)__B); -#else - return (__m512i)__builtin_elementwise_min((__v16su)__A, (__v16su)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epu32 (__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_min_epu32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epu32 (__mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_min_epu32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epi64(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminsq512((__v8di)__A, (__v8di)__B); -#else - return (__m512i)__builtin_elementwise_min((__v8di)__A, (__v8di)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epi64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_min_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epi64 (__mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_min_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_min_epu64(__m512i __A, __m512i __B) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pminuq512((__v8di)__A, (__v8di)__B); -#else - return (__m512i)__builtin_elementwise_min((__v8du)__A, (__v8du)__B); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_min_epu64 (__m512i __W, __mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_min_epu64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_epu64 (__mmask8 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_min_epu64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epi32(__m512i __X, __m512i __Y) -{ - return 
(__m512i)__builtin_ia32_pmuldq512((__v16si)__X, (__v16si) __Y); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epi32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_mul_epi32(__X, __Y), - (__v8di)__W); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epi32(__mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_mul_epi32(__X, __Y), - (__v8di)_mm512_setzero_si512 ()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mul_epu32(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_pmuludq512((__v16si)__X, (__v16si)__Y); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_epu32(__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_mul_epu32(__X, __Y), - (__v8di)__W); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_epu32(__mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_mul_epu32(__X, __Y), - (__v8di)_mm512_setzero_si512 ()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mullo_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i) ((__v16su) __A * (__v16su) __B); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mullo_epi32(__mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_mullo_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mullo_epi32(__m512i __W, __mmask16 __M, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_mullo_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mullox_epi64 (__m512i __A, __m512i __B) { - return (__m512i) ((__v8du) __A * (__v8du) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mullox_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) { - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_mullox_epi64(__A, __B), - (__v8di)__W); -} - -#define _mm512_sqrt_round_pd(A, R) \ - ((__m512d)__builtin_ia32_sqrtpd512((__v8df)(__m512d)(A), (int)(R))) - -#define _mm512_mask_sqrt_round_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_sqrt_round_pd(U, A, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_sqrt_round_pd((A), (R)), \ - (__v8df)_mm512_setzero_pd())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_sqrt_pd(__m512d __A) -{ - return (__m512d)__builtin_ia32_sqrtpd512((__v8df)__A, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_sqrt_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_sqrt_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_sqrt_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_sqrt_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_sqrt_round_ps(A, R) \ - ((__m512)__builtin_ia32_sqrtps512((__v16sf)(__m512)(A), (int)(R))) - -#define _mm512_mask_sqrt_round_ps(W, U, A, R) \ - 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_sqrt_round_ps(U, A, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_sqrt_round_ps((A), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_sqrt_ps(__m512 __A) -{ - return (__m512)__builtin_ia32_sqrtps512((__v16sf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_sqrt_ps(__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_sqrt_ps(__A), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_sqrt_ps( __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_sqrt_ps(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_rsqrt14_pd(__m512d __A) -{ - return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1);} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_rsqrt14_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_rsqrt14_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rsqrt14pd512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_rsqrt14_ps(__m512 __A) -{ - return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_rsqrt14_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_rsqrt14_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rsqrt14ps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_rsqrt14_ss(__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_rsqrt14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt14_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rsqrt14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_rsqrt14_sd(__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rsqrt14sd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_rsqrt14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt14_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - 
return (__m128d) __builtin_ia32_rsqrt14sd_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_rcp14_pd(__m512d __A) -{ - return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_rcp14_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_rcp14_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rcp14pd512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_rcp14_ps(__m512 __A) -{ - return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_rcp14_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_rcp14_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rcp14ps512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_rcp14_ss(__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_rcp14_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_rcp14_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_rcp14ss_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_rcp14_sd(__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_rcp14_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_rcp14_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_rcp14sd_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_floor_ps(__m512 __A) -{ - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - _MM_FROUND_FLOOR, - (__v16sf) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - _MM_FROUND_FLOOR, - (__v16sf) __W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_floor_pd(__m512d __A) -{ - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - _MM_FROUND_FLOOR, - (__v8df) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} - -static 
__inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - _MM_FROUND_FLOOR, - (__v8df) __W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - _MM_FROUND_CEIL, - (__v16sf) __W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_ceil_ps(__m512 __A) -{ - return (__m512) __builtin_ia32_rndscaleps_mask ((__v16sf) __A, - _MM_FROUND_CEIL, - (__v16sf) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_ceil_pd(__m512d __A) -{ - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - _MM_FROUND_CEIL, - (__v8df) __A, -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_rndscalepd_mask ((__v8df) __A, - _MM_FROUND_CEIL, - (__v8df) __W, __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi64(__m512i __A) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pabsq512((__v8di)__A); -#else - return (__m512i)__builtin_elementwise_abs((__v8di)__A); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_abs_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_abs_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_abs_epi32(__m512i __A) -{ -#if (__clang_major__ < 14) - return (__m512i)__builtin_ia32_pabsd512((__v16si) __A); -#else - return (__m512i)__builtin_elementwise_abs((__v16si) __A); -#endif -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_abs_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_abs_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_abs_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_add_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_add_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_add_ss(__mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_add_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); -} - -#define _mm_add_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_add_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_add_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_addss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - 
(__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_add_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_add_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_add_sd(__mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_add_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); -} -#define _mm_add_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_add_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_add_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_addsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_add_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_add_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_add_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_add_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_add_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_add_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -#define _mm512_add_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_addpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_add_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_add_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_add_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_add_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_addps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_add_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_add_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_add_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_sub_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_sub_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_ss(__mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_sub_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); -} -#define _mm_sub_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), 
\ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_sub_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_sub_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_subss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_sub_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_sub_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_sd(__mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_sub_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); -} - -#define _mm_sub_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_sub_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_sub_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_subsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_sub_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_sub_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_sub_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_sub_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -#define _mm512_sub_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_subpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_sub_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_sub_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_sub_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_sub_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_subps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_sub_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_sub_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_sub_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mul_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 
__B) { - __A = _mm_mul_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_ss(__mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_mul_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); -} -#define _mm_mul_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_mul_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_mul_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_mulss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mul_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_mul_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_sd(__mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_mul_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); -} - -#define _mm_mul_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_mul_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_mul_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_mulsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_mul_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_mul_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_mul_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_mul_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -#define _mm512_mul_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_mulpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_mul_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_mul_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_mul_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_mul_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_mulps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_mul_round_ps(W, U, A, B, R) \ - 
((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_mul_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_mul_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_div_ss(__m128 __W, __mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_div_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_div_ss(__mmask8 __U,__m128 __A, __m128 __B) { - __A = _mm_div_ss(__A, __B); - return __builtin_ia32_selectss_128(__U, __A, _mm_setzero_ps()); -} - -#define _mm_div_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_div_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm_maskz_div_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_divss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_div_sd(__m128d __W, __mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_div_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_div_sd(__mmask8 __U,__m128d __A, __m128d __B) { - __A = _mm_div_sd(__A, __B); - return __builtin_ia32_selectsd_128(__U, __A, _mm_setzero_pd()); -} - -#define _mm_div_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_div_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_div_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_divsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_div_pd(__m512d __a, __m512d __b) -{ - return (__m512d)((__v8df)__a/(__v8df)__b); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_div_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_div_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_div_pd(__mmask8 __U, __m512d __A, __m512d __B) { - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_div_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_div_ps(__m512 __a, __m512 __b) -{ - return (__m512)((__v16sf)__a/(__v16sf)__b); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_div_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_div_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_div_ps(__mmask16 __U, __m512 __A, __m512 __B) { - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_div_ps(__A, __B), - 
(__v16sf)_mm512_setzero_ps()); -} - -#define _mm512_div_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_divpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(R))) - -#define _mm512_mask_div_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_div_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_div_round_pd((A), (B), (R)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_div_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_divps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(R))) - -#define _mm512_mask_div_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_div_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_div_round_ps((A), (B), (R)), \ - (__v16sf)_mm512_setzero_ps())) - -#define _mm512_roundscale_ps(A, B) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_roundscale_ps(A, B, C, imm) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ - (__v16sf)(__m512)(A), (__mmask16)(B), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_roundscale_ps(A, B, imm) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_roundscale_round_ps(A, B, C, imm, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(C), (int)(imm), \ - (__v16sf)(__m512)(A), (__mmask16)(B), \ - (int)(R))) - -#define _mm512_maskz_roundscale_round_ps(A, B, imm, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(B), (int)(imm), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(A), (int)(R))) - -#define _mm512_roundscale_round_ps(A, imm, R) \ - ((__m512)__builtin_ia32_rndscaleps_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_roundscale_pd(A, B) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_roundscale_pd(A, B, C, imm) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ - (__v8df)(__m512d)(A), (__mmask8)(B), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_roundscale_pd(A, B, imm) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_roundscale_round_pd(A, B, C, imm, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(C), (int)(imm), \ - (__v8df)(__m512d)(A), (__mmask8)(B), \ - (int)(R))) - -#define _mm512_maskz_roundscale_round_pd(A, B, imm, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(B), (int)(imm), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(A), (int)(R))) - -#define _mm512_roundscale_round_pd(A, imm, R) \ - ((__m512d)__builtin_ia32_rndscalepd_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_fmadd_round_pd(A, B, C, R) \ - 
((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_mask_fmadd_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_mask3_fmadd_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_fmadd_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_fmsub_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_mask_fmsub_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_fmsub_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_fnmadd_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_mask3_fnmadd_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_fnmadd_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_fnmsub_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_maskz_fnmsub_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_maskz(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_fmadd_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 
-_mm512_fmsub_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - -(__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask3 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - -(__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_maskz (-(__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmadd_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_mask_fmadd_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_mask3_fmadd_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_maskz_fmadd_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_fmsub_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_mask_fmsub_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_maskz_fmsub_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - 
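Editorial note, not part of this diff or of the upstream header: the block above covers the packed double-precision FMA family. The static inline functions (_mm512_fmadd_pd and its mask/mask3/maskz variants) lower to the __builtin_ia32_vfmaddpd512_* builtins with _MM_FROUND_CUR_DIRECTION, while the *_round_* macros expose an explicit, compile-time rounding mode. A minimal usage sketch follows; it assumes a clang build with -mavx512f and the usual <immintrin.h> umbrella include, and the helper name axpy8_masked and its parameters are made up for the example.

#include <immintrin.h>

/* Illustrative sketch only: y[i] += a * x[i] for the lanes selected by
 * `lanes`; the other elements of y are neither read nor written. */
static void axpy8_masked(double a, const double *x, double *y, __mmask8 lanes)
{
    __m512d va = _mm512_set1_pd(a);
    __m512d vx = _mm512_maskz_loadu_pd(lanes, x);  /* inactive lanes read as 0 */
    __m512d vy = _mm512_maskz_loadu_pd(lanes, y);

    /* _mm512_mask3_fmadd_pd (defined above) merges with the addend operand,
     * so inactive lanes of r simply keep the value of vy. */
    __m512d r = _mm512_mask3_fmadd_pd(va, vx, vy, lanes);

    /* Same computation with an explicit rounding mode, routed through the
     * _mm512_fmadd_round_pd macro above; the mode must be a constant. */
    __m512d rr = _mm512_fmadd_round_pd(va, vx, vy,
                                       _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
    (void)rr;

    _mm512_mask_storeu_pd(y, lanes, r);
}

Under those assumptions the active lanes compute a*x[i] + y[i] while masked-off lanes are untouched, which is the usual pattern for loop tails; with clang and -mavx512f this should reduce to masked loads, one vfmadd with a k-register merge, and a masked store.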
- -#define _mm512_fnmadd_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_mask3_fnmadd_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_maskz_fnmadd_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_fnmsub_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_maskz_fnmsub_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_maskz(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fmadd_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fmsub_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - -(__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask3 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmadd_ps(__mmask16 
__U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - -(__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_maskz (-(__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmaddsub_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_mask_fmaddsub_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_mask3_fmaddsub_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_fmaddsub_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_fmsubadd_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)-1, (int)(R))) - - -#define _mm512_mask_fmsubadd_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_maskz_fmsubadd_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddsubpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - -(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fmaddsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmaddsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmaddsub_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) -1, 
- _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsubadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_mask ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsubadd_pd(__mmask8 __U, __m512d __A, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddsubpd512_maskz ((__v8df) __A, - (__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmaddsub_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_mask_fmaddsub_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_mask3_fmaddsub_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_maskz_fmaddsub_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_fmsubadd_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)-1, (int)(R))) - - -#define _mm512_mask_fmsubadd_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_maskz_fmsubadd_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddsubps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fmaddsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmaddsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmaddsub_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsubadd_ps(__m512 __A, __mmask16 __U, 
__m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_mask ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsubadd_ps(__mmask16 __U, __m512 __A, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddsubps512_maskz ((__v16sf) __A, - (__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsub_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmsubpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d)__builtin_ia32_vfmsubpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsub_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmsubps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512)__builtin_ia32_vfmsubps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsubadd_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmsubaddpd512_mask3((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsubadd_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d)__builtin_ia32_vfmsubaddpd512_mask3 ((__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsubadd_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmsubaddps512_mask3((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsubadd_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512)__builtin_ia32_vfmsubaddps512_mask3 ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmadd_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmadd_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - -(__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmadd_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmadd_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - -(__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmsub_round_pd(A, U, B, C, R) \ - ((__m512d)__builtin_ia32_vfmaddpd512_mask((__v8df)(__m512d)(A), \ - -(__v8df)(__m512d)(B), \ - 
-(__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -#define _mm512_mask3_fnmsub_round_pd(A, B, C, U, R) \ - ((__m512d)__builtin_ia32_vfmsubpd512_mask3(-(__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(C), \ - (__mmask8)(U), (int)(R))) - - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmsub_pd(__m512d __A, __mmask8 __U, __m512d __B, __m512d __C) -{ - return (__m512d) __builtin_ia32_vfmaddpd512_mask ((__v8df) __A, - -(__v8df) __B, - -(__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmsub_pd(__m512d __A, __m512d __B, __m512d __C, __mmask8 __U) -{ - return (__m512d) __builtin_ia32_vfmsubpd512_mask3 (-(__v8df) __A, - (__v8df) __B, - (__v8df) __C, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmsub_round_ps(A, U, B, C, R) \ - ((__m512)__builtin_ia32_vfmaddps512_mask((__v16sf)(__m512)(A), \ - -(__v16sf)(__m512)(B), \ - -(__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -#define _mm512_mask3_fnmsub_round_ps(A, B, C, U, R) \ - ((__m512)__builtin_ia32_vfmsubps512_mask3(-(__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(C), \ - (__mmask16)(U), (int)(R))) - - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmsub_ps(__m512 __A, __mmask16 __U, __m512 __B, __m512 __C) -{ - return (__m512) __builtin_ia32_vfmaddps512_mask ((__v16sf) __A, - -(__v16sf) __B, - -(__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmsub_ps(__m512 __A, __m512 __B, __m512 __C, __mmask16 __U) -{ - return (__m512) __builtin_ia32_vfmsubps512_mask3 (-(__v16sf) __A, - (__v16sf) __B, - (__v16sf) __C, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - - - -/* Vector permutations */ - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi32(__m512i __A, __m512i __I, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpermi2vard512((__v16si)__A, (__v16si) __I, - (__v16si) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_epi32(__m512i __A, __mmask16 __U, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), - (__v16si)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_epi32(__m512i __A, __m512i __I, __mmask16 __U, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), - (__v16si)__I); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_epi32(__mmask16 __U, __m512i __A, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_permutex2var_epi32(__A, __I, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_epi64(__m512i __A, __m512i __I, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpermi2varq512((__v8di)__A, (__v8di) __I, - (__v8di) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_epi64(__m512i __A, __mmask8 __U, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), - (__v8di)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_epi64(__m512i __A, __m512i __I, __mmask8 __U, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - 
(__v8di)_mm512_permutex2var_epi64(__A, __I, __B), - (__v8di)__I); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_epi64(__mmask8 __U, __m512i __A, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_permutex2var_epi64(__A, __I, __B), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_alignr_epi64(A, B, I) \ - ((__m512i)__builtin_ia32_alignq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I))) - -#define _mm512_mask_alignr_epi64(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_alignr_epi64(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_alignr_epi64((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_alignr_epi32(A, B, I) \ - ((__m512i)__builtin_ia32_alignd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I))) - -#define _mm512_mask_alignr_epi32(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_alignr_epi32(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_alignr_epi32((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) -/* Vector Extract */ - -#define _mm512_extractf64x4_pd(A, I) \ - ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(I), \ - (__v4df)_mm256_undefined_pd(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf64x4_pd(W, U, A, imm) \ - ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extractf64x4_pd(U, A, imm) \ - ((__m256d)__builtin_ia32_extractf64x4_mask((__v8df)(__m512d)(A), (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm512_extractf32x4_ps(A, I) \ - ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1)) - -#define _mm512_mask_extractf32x4_ps(W, U, A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extractf32x4_ps(U, A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_mask((__v16sf)(__m512)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -/* Vector Blend */ - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_pd(__mmask8 __U, __m512d __A, __m512d __W) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __W, - (__v8df) __A); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_ps(__mmask16 __U, __m512 __A, __m512 __W) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __W, - (__v16sf) __A); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi64(__mmask8 __U, __m512i __A, __m512i __W) -{ - return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, - (__v8di) __W, - (__v8di) __A); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_epi32(__mmask16 __U, __m512i __A, __m512i __W) -{ - return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, - (__v16si) __W, - (__v16si) __A); -} - -/* Compare */ - -#define _mm512_cmp_round_ps_mask(A, B, P, R) \ - ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - 
(__mmask16)-1, (int)(R))) - -#define _mm512_mask_cmp_round_ps_mask(U, A, B, P, R) \ - ((__mmask16)__builtin_ia32_cmpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(P), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_cmp_ps_mask(A, B, P) \ - _mm512_cmp_round_ps_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_cmp_ps_mask(U, A, B, P) \ - _mm512_mask_cmp_round_ps_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_cmpeq_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_EQ_OQ) -#define _mm512_mask_cmpeq_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_EQ_OQ) - -#define _mm512_cmplt_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_LT_OS) -#define _mm512_mask_cmplt_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LT_OS) - -#define _mm512_cmple_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_LE_OS) -#define _mm512_mask_cmple_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_LE_OS) - -#define _mm512_cmpunord_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_UNORD_Q) -#define _mm512_mask_cmpunord_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_UNORD_Q) - -#define _mm512_cmpneq_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_NEQ_UQ) -#define _mm512_mask_cmpneq_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NEQ_UQ) - -#define _mm512_cmpnlt_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_NLT_US) -#define _mm512_mask_cmpnlt_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLT_US) - -#define _mm512_cmpnle_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_NLE_US) -#define _mm512_mask_cmpnle_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_NLE_US) - -#define _mm512_cmpord_ps_mask(A, B) \ - _mm512_cmp_ps_mask((A), (B), _CMP_ORD_Q) -#define _mm512_mask_cmpord_ps_mask(k, A, B) \ - _mm512_mask_cmp_ps_mask((k), (A), (B), _CMP_ORD_Q) - -#define _mm512_cmp_round_pd_mask(A, B, P, R) \ - ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cmp_round_pd_mask(U, A, B, P, R) \ - ((__mmask8)__builtin_ia32_cmppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(P), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_cmp_pd_mask(A, B, P) \ - _mm512_cmp_round_pd_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) -#define _mm512_mask_cmp_pd_mask(U, A, B, P) \ - _mm512_mask_cmp_round_pd_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_cmpeq_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_EQ_OQ) -#define _mm512_mask_cmpeq_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_EQ_OQ) - -#define _mm512_cmplt_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_LT_OS) -#define _mm512_mask_cmplt_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LT_OS) - -#define _mm512_cmple_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_LE_OS) -#define _mm512_mask_cmple_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_LE_OS) - -#define _mm512_cmpunord_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_UNORD_Q) -#define _mm512_mask_cmpunord_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_UNORD_Q) - -#define _mm512_cmpneq_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_NEQ_UQ) -#define _mm512_mask_cmpneq_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NEQ_UQ) - -#define _mm512_cmpnlt_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_NLT_US) -#define 
_mm512_mask_cmpnlt_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLT_US) - -#define _mm512_cmpnle_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_NLE_US) -#define _mm512_mask_cmpnle_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_NLE_US) - -#define _mm512_cmpord_pd_mask(A, B) \ - _mm512_cmp_pd_mask((A), (B), _CMP_ORD_Q) -#define _mm512_mask_cmpord_pd_mask(k, A, B) \ - _mm512_mask_cmp_pd_mask((k), (A), (B), _CMP_ORD_Q) - -/* Conversion */ - -#define _mm512_cvtt_roundps_epu32(A, R) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_undefined_epi32(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundps_epu32(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundps_epu32(U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R))) - - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttps_epu32(__m512 __A) -{ - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttps_epu32 (__mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvttps2udq512_mask ((__v16sf) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepi32_ps(A, R) \ - ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepi32_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepi32_ps(U, A, R) \ - ((__m512)__builtin_ia32_cvtdq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_cvt_roundepu32_ps(A, R) \ - ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvt_roundepu32_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepu32_ps(U, A, R) \ - ((__m512)__builtin_ia32_cvtudq2ps512_mask((__v16si)(__m512i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_ps (__m512i __A) -{ - return (__m512)__builtin_convertvector((__v16su)__A, __v16sf); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_ps (__m512 __W, __mmask16 __U, __m512i __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_cvtepu32_ps(__A), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_ps (__mmask16 __U, __m512i __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_cvtepu32_ps(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline __m512d 
__DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_pd(__m256i __A) -{ - return (__m512d)__builtin_convertvector((__v8si)__A, __v8df); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_pd (__m512d __W, __mmask8 __U, __m256i __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_cvtepi32_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_pd (__mmask8 __U, __m256i __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_cvtepi32_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32lo_pd(__m512i __A) -{ - return (__m512d) _mm512_cvtepi32_pd(_mm512_castsi512_si256(__A)); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) -{ - return (__m512d) _mm512_mask_cvtepi32_pd(__W, __U, _mm512_castsi512_si256(__A)); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_ps (__m512i __A) -{ - return (__m512)__builtin_convertvector((__v16si)__A, __v16sf); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_ps (__m512 __W, __mmask16 __U, __m512i __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_cvtepi32_ps(__A), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_ps (__mmask16 __U, __m512i __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_cvtepi32_ps(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_pd(__m256i __A) -{ - return (__m512d)__builtin_convertvector((__v8su)__A, __v8df); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_pd (__m512d __W, __mmask8 __U, __m256i __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_cvtepu32_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_pd (__mmask8 __U, __m256i __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_cvtepu32_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32lo_pd(__m512i __A) -{ - return (__m512d) _mm512_cvtepu32_pd(_mm512_castsi512_si256(__A)); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32lo_pd(__m512d __W, __mmask8 __U,__m512i __A) -{ - return (__m512d) _mm512_mask_cvtepu32_pd(__W, __U, _mm512_castsi512_si256(__A)); -} - -#define _mm512_cvt_roundpd_ps(A, R) \ - ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundpd_ps(W, U, A, R) \ - ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)(__m256)(W), (__mmask8)(U), \ - (int)(R))) - -#define _mm512_maskz_cvt_roundpd_ps(U, A, R) \ - ((__m256)__builtin_ia32_cvtpd2ps512_mask((__v8df)(__m512d)(A), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_ps (__m512d __A) -{ - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) _mm256_undefined_ps (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_ps (__m256 __W, __mmask8 __U, __m512d __A) -{ - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) __W, - (__mmask8) 
__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_ps (__mmask8 __U, __m512d __A) -{ - return (__m256) __builtin_ia32_cvtpd2ps512_mask ((__v8df) __A, - (__v8sf) _mm256_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_pslo (__m512d __A) -{ - return (__m512) __builtin_shufflevector((__v8sf) _mm512_cvtpd_ps(__A), - (__v8sf) _mm256_setzero_ps (), - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_pslo (__m512 __W, __mmask8 __U,__m512d __A) -{ - return (__m512) __builtin_shufflevector ( - (__v8sf) _mm512_mask_cvtpd_ps (_mm512_castps512_ps256(__W), - __U, __A), - (__v8sf) _mm256_setzero_ps (), - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -} - -#define _mm512_cvt_roundps_ph(A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_undefined_si256(), \ - (__mmask16)-1)) - -#define _mm512_mask_cvt_roundps_ph(U, W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)(__m256i)(U), \ - (__mmask16)(W))) - -#define _mm512_maskz_cvt_roundps_ph(W, A, I) \ - ((__m256i)__builtin_ia32_vcvtps2ph512_mask((__v16sf)(__m512)(A), (int)(I), \ - (__v16hi)_mm256_setzero_si256(), \ - (__mmask16)(W))) - -#define _mm512_cvtps_ph _mm512_cvt_roundps_ph -#define _mm512_mask_cvtps_ph _mm512_mask_cvt_roundps_ph -#define _mm512_maskz_cvtps_ph _mm512_maskz_cvt_roundps_ph - -#define _mm512_cvt_roundph_ps(A, R) \ - ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvt_roundph_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_ps(U, A, R) \ - ((__m512)__builtin_ia32_vcvtph2ps512_mask((__v16hi)(__m256i)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_cvtph_ps(__m256i __A) -{ - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_ps (__m512 __W, __mmask16 __U, __m256i __A) -{ - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_ps (__mmask16 __U, __m256i __A) -{ - return (__m512) __builtin_ia32_vcvtph2ps512_mask ((__v16hi) __A, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundpd_epi32(A, R) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundpd_epi32(W, U, A, R) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundpd_epi32(U, A, R) \ - ((__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R))) - -static __inline __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvttpd_epi32(__m512d __a) -{ - return (__m256i)__builtin_ia32_cvttpd2dq512_mask((__v8df) __a, - 
(__v8si)_mm256_setzero_si256(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttpd_epi32 (__mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvttpd2dq512_mask ((__v8df) __A, - (__v8si) _mm256_setzero_si256 (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundps_epi32(A, R) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundps_epi32(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundps_epi32(U, A, R) \ - ((__m512i)__builtin_ia32_cvttps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R))) - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttps_epi32(__m512 __a) -{ - return (__m512i) - __builtin_ia32_cvttps2dq512_mask((__v16sf) __a, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) -1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttps_epi32 (__mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvttps2dq512_mask ((__v16sf) __A, - (__v16si) _mm512_setzero_si512 (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundps_epi32(A, R) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvt_roundps_epi32(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundps_epi32(U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2dq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtps_epi32 (__m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) _mm512_undefined_epi32 (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_epi32 (__m512i __W, __mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_epi32 (__mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2dq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundpd_epi32(A, R) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundpd_epi32(W, U, A, R) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), 
(int)(R))) - -#define _mm512_maskz_cvt_roundpd_epi32(U, A, R) \ - ((__m256i)__builtin_ia32_cvtpd2dq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_epi32 (__m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_epi32 (__m256i __W, __mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_epi32 (__mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2dq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundps_epu32(A, R) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_cvt_roundps_epu32(W, U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)(__m512i)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundps_epu32(U, A, R) \ - ((__m512i)__builtin_ia32_cvtps2udq512_mask((__v16sf)(__m512)(A), \ - (__v16si)_mm512_setzero_si512(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtps_epu32 ( __m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A,\ - (__v16si)\ - _mm512_undefined_epi32 (), - (__mmask16) -1,\ - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_epu32 (__m512i __W, __mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_epu32 ( __mmask16 __U, __m512 __A) -{ - return (__m512i) __builtin_ia32_cvtps2udq512_mask ((__v16sf) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U , - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundpd_epu32(A, R) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundpd_epu32(W, U, A, R) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundpd_epu32(U, A, R) \ - ((__m256i)__builtin_ia32_cvtpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtpd_epu32 (__m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_epu32 (__mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvtpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - 
_MM_FROUND_CUR_DIRECTION); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_cvtsd_f64(__m512d __a) -{ - return __a[0]; -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_cvtss_f32(__m512 __a) -{ - return __a[0]; -} - -/* Unpack and Interleave */ - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_pd(__m512d __a, __m512d __b) -{ - return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, - 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_unpackhi_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_pd(__mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_unpackhi_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_pd(__m512d __a, __m512d __b) -{ - return (__m512d)__builtin_shufflevector((__v8df)__a, (__v8df)__b, - 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_unpacklo_pd(__A, __B), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_pd (__mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8) __U, - (__v8df)_mm512_unpacklo_pd(__A, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_ps(__m512 __a, __m512 __b) -{ - return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, - 2, 18, 3, 19, - 2+4, 18+4, 3+4, 19+4, - 2+8, 18+8, 3+8, 19+8, - 2+12, 18+12, 3+12, 19+12); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, - (__v16sf)_mm512_unpackhi_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_ps (__mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, - (__v16sf)_mm512_unpackhi_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_ps(__m512 __a, __m512 __b) -{ - return (__m512)__builtin_shufflevector((__v16sf)__a, (__v16sf)__b, - 0, 16, 1, 17, - 0+4, 16+4, 1+4, 17+4, - 0+8, 16+8, 1+8, 17+8, - 0+12, 16+12, 1+12, 17+12); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, - (__v16sf)_mm512_unpacklo_ps(__A, __B), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_ps (__mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16) __U, - (__v16sf)_mm512_unpacklo_ps(__A, __B), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi32(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, - 2, 18, 3, 19, - 2+4, 18+4, 3+4, 19+4, - 2+8, 18+8, 3+8, 19+8, - 2+12, 18+12, 3+12, 19+12); -} - -static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, - (__v16si)_mm512_unpackhi_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_epi32(__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, - (__v16si)_mm512_unpackhi_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi32(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_shufflevector((__v16si)__A, (__v16si)__B, - 0, 16, 1, 17, - 0+4, 16+4, 1+4, 17+4, - 0+8, 16+8, 1+8, 17+8, - 0+12, 16+12, 1+12, 17+12); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, - (__v16si)_mm512_unpacklo_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_epi32(__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16) __U, - (__v16si)_mm512_unpacklo_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpackhi_epi64(__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, - 1, 9, 1+2, 9+2, 1+4, 9+4, 1+6, 9+6); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpackhi_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, - (__v8di)_mm512_unpackhi_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpackhi_epi64(__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, - (__v8di)_mm512_unpackhi_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_unpacklo_epi64 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_shufflevector((__v8di)__A, (__v8di)__B, - 0, 8, 0+2, 8+2, 0+4, 8+4, 0+6, 8+6); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_unpacklo_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, - (__v8di)_mm512_unpacklo_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_unpacklo_epi64 (__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8) __U, - (__v8di)_mm512_unpacklo_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - - -/* SIMD load ops */ - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_loadu_si512 (void const *__P) -{ - struct __loadu_si512 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_si512*)__P)->__v; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_loadu_epi32 (void const *__P) -{ - struct __loadu_epi32 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi32*)__P)->__v; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_epi32 (__m512i __W, __mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *) __P, - (__v16si) __W, - (__mmask16) __U); -} - - -static __inline __m512i __DEFAULT_FN_ATTRS512 
-_mm512_maskz_loadu_epi32(__mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddqusi512_mask ((const int *)__P, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_loadu_epi64 (void const *__P) -{ - struct __loadu_epi64 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi64*)__P)->__v; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_epi64 (__m512i __W, __mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *) __P, - (__v8di) __W, - (__mmask8) __U); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_loadu_epi64(__mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_loaddqudi512_mask ((const long long *)__P, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_ps (__m512 __W, __mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_loadups512_mask ((const float *) __P, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_loadu_ps(__mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_loadups512_mask ((const float *)__P, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_loadu_pd (__m512d __W, __mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_loadupd512_mask ((const double *) __P, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_loadu_pd(__mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_loadupd512_mask ((const double *)__P, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_loadu_pd(void const *__p) -{ - struct __loadu_pd { - __m512d_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_pd*)__p)->__v; -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_loadu_ps(void const *__p) -{ - struct __loadu_ps { - __m512_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ps*)__p)->__v; -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_load_ps(void const *__p) -{ - return *(const __m512*)__p; -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_load_ps (__m512 __W, __mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *) __P, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_load_ps(__mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_loadaps512_mask ((const __v16sf *)__P, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_load_pd(void const *__p) -{ - return *(const __m512d*)__p; -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_load_pd (__m512d __W, __mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *) __P, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_load_pd(__mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_loadapd512_mask ((const __v8df *)__P, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_load_si512 (void const *__P) -{ - return *(const 
__m512i *) __P; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_load_epi32 (void const *__P) -{ - return *(const __m512i *) __P; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_load_epi64 (void const *__P) -{ - return *(const __m512i *) __P; -} - -/* SIMD store ops */ - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_epi64 (void *__P, __m512i __A) -{ - struct __storeu_epi64 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi64*)__P)->__v = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_epi64(void *__P, __mmask8 __U, __m512i __A) -{ - __builtin_ia32_storedqudi512_mask ((long long *)__P, (__v8di) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_si512 (void *__P, __m512i __A) -{ - struct __storeu_si512 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si512*)__P)->__v = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_epi32 (void *__P, __m512i __A) -{ - struct __storeu_epi32 { - __m512i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi32*)__P)->__v = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_epi32(void *__P, __mmask16 __U, __m512i __A) -{ - __builtin_ia32_storedqusi512_mask ((int *)__P, (__v16si) __A, - (__mmask16) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_pd(void *__P, __mmask8 __U, __m512d __A) -{ - __builtin_ia32_storeupd512_mask ((double *)__P, (__v8df) __A, (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_pd(void *__P, __m512d __A) -{ - struct __storeu_pd { - __m512d_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_pd*)__P)->__v = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_storeu_ps(void *__P, __mmask16 __U, __m512 __A) -{ - __builtin_ia32_storeups512_mask ((float *)__P, (__v16sf) __A, - (__mmask16) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_storeu_ps(void *__P, __m512 __A) -{ - struct __storeu_ps { - __m512_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ps*)__P)->__v = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_store_pd(void *__P, __mmask8 __U, __m512d __A) -{ - __builtin_ia32_storeapd512_mask ((__v8df *)__P, (__v8df) __A, (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_store_pd(void *__P, __m512d __A) -{ - *(__m512d*)__P = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_mask_store_ps(void *__P, __mmask16 __U, __m512 __A) -{ - __builtin_ia32_storeaps512_mask ((__v16sf *)__P, (__v16sf) __A, - (__mmask16) __U); -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_store_ps(void *__P, __m512 __A) -{ - *(__m512*)__P = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_store_si512 (void *__P, __m512i __A) -{ - *(__m512i *) __P = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_store_epi32 (void *__P, __m512i __A) -{ - *(__m512i *) __P = __A; -} - -static __inline void __DEFAULT_FN_ATTRS512 -_mm512_store_epi64 (void *__P, __m512i __A) -{ - *(__m512i *) __P = __A; -} - -/* Mask ops */ - -static __inline __mmask16 __DEFAULT_FN_ATTRS -_mm512_knot(__mmask16 __M) -{ - return __builtin_ia32_knothi(__M); -} - -/* Integer compare */ - -#define _mm512_cmpeq_epi32_mask(A, B) \ - _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) -#define _mm512_mask_cmpeq_epi32_mask(k, A, B) \ - 
    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi32_mask(A, B) \
-    _mm512_cmp_epi32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi32_mask(k, A, B) \
-    _mm512_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu32_mask(A, B) \
-    _mm512_cmp_epu32_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu32_mask(k, A, B) \
-    _mm512_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epi64_mask(A, B) \
-    _mm512_cmp_epi64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epi64_mask(k, A, B) \
-    _mm512_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-#define _mm512_cmpeq_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ)
-#define _mm512_mask_cmpeq_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ)
-#define _mm512_cmpge_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GE)
-#define _mm512_mask_cmpge_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE)
-#define _mm512_cmpgt_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_GT)
-#define _mm512_mask_cmpgt_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT)
-#define _mm512_cmple_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LE)
-#define _mm512_mask_cmple_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE)
-#define _mm512_cmplt_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_LT)
-#define _mm512_mask_cmplt_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT)
-#define _mm512_cmpneq_epu64_mask(A, B) \
-    _mm512_cmp_epu64_mask((A), (B), _MM_CMPINT_NE)
-#define _mm512_mask_cmpneq_epu64_mask(k, A, B) \
-    _mm512_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE)
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi32(__m128i __A)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs. */
-  return (__m512i)__builtin_convertvector((__v16qs)__A, __v16si);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_mask_cvtepi8_epi32(__m512i __W, __mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)__W);
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_maskz_cvtepi8_epi32(__mmask16 __U, __m128i __A)
-{
-  return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U,
-                                             (__v16si)_mm512_cvtepi8_epi32(__A),
-                                             (__v16si)_mm512_setzero_si512());
-}
-
-static __inline__ __m512i __DEFAULT_FN_ATTRS512
-_mm512_cvtepi8_epi64(__m128i __A)
-{
-  /* This function always performs a signed extension, but __v16qi is a char
-     which may be signed or unsigned, so use __v16qs.
*/ - return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__A, (__v16qs)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi8_epi64(__m512i __W, __mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi8_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi8_epi64(__A), - (__v8di)_mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_epi64(__m256i __X) -{ - return (__m512i)__builtin_convertvector((__v8si)__X, __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_epi64(__m512i __W, __mmask8 __U, __m256i __X) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi32_epi64(__X), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_epi64(__mmask8 __U, __m256i __X) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi32_epi64(__X), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_epi32(__m256i __A) -{ - return (__m512i)__builtin_convertvector((__v16hi)__A, __v16si); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi16_epi32(__m512i __W, __mmask16 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepi16_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi16_epi32(__mmask16 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepi16_epi32(__A), - (__v16si)_mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_epi64(__m128i __A) -{ - return (__m512i)__builtin_convertvector((__v8hi)__A, __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi16_epi64(__m512i __W, __mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi16_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepi16_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi32(__m128i __A) -{ - return (__m512i)__builtin_convertvector((__v16qu)__A, __v16si); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu8_epi32(__m512i __W, __mmask16 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepu8_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu8_epi32(__mmask16 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepu8_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu8_epi64(__m128i __A) -{ - return (__m512i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__A, (__v16qu)__A, 0, 1, 2, 3, 4, 5, 6, 7), __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu8_epi64(__m512i __W, __mmask8 __U, __m128i __A) -{ - return 
(__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu8_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu8_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_epi64(__m256i __X) -{ - return (__m512i)__builtin_convertvector((__v8su)__X, __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_epi64(__m512i __W, __mmask8 __U, __m256i __X) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu32_epi64(__X), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_epi64(__mmask8 __U, __m256i __X) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu32_epi64(__X), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu16_epi32(__m256i __A) -{ - return (__m512i)__builtin_convertvector((__v16hu)__A, __v16si); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu16_epi32(__m512i __W, __mmask16 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepu16_epi32(__A), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu16_epi32(__mmask16 __U, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_cvtepu16_epi32(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtepu16_epi64(__m128i __A) -{ - return (__m512i)__builtin_convertvector((__v8hu)__A, __v8di); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu16_epi64(__m512i __W, __mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu16_epi64(__A), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_cvtepu16_epi64(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_rorv_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_prorvd512((__v16si)__A, (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_rorv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_rorv_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_rorv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_rorv_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_rorv_epi64 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_prorvq512((__v8di)__A, (__v8di)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_rorv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_rorv_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_rorv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - 
(__v8di)_mm512_rorv_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - - - -#define _mm512_cmp_epi32_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1)) - -#define _mm512_cmp_epu32_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)-1)) - -#define _mm512_cmp_epi64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm512_cmp_epu64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm512_mask_cmp_epi32_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_cmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm512_mask_cmp_epu32_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpd512_mask((__v16si)(__m512i)(a), \ - (__v16si)(__m512i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm512_mask_cmp_epi64_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm512_mask_cmp_epu64_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq512_mask((__v8di)(__m512i)(a), \ - (__v8di)(__m512i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm512_rol_epi32(a, b) \ - ((__m512i)__builtin_ia32_prold512((__v16si)(__m512i)(a), (int)(b))) - -#define _mm512_mask_rol_epi32(W, U, a, b) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_rol_epi32(U, a, b) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_rol_epi32((a), (b)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_rol_epi64(a, b) \ - ((__m512i)__builtin_ia32_prolq512((__v8di)(__m512i)(a), (int)(b))) - -#define _mm512_mask_rol_epi64(W, U, a, b) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_rol_epi64(U, a, b) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_rol_epi64((a), (b)), \ - (__v8di)_mm512_setzero_si512())) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_rolv_epi32 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_prolvd512((__v16si)__A, (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_rolv_epi32 (__m512i __W, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_rolv_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_rolv_epi32 (__mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_rolv_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_rolv_epi64 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_prolvq512((__v8di)__A, (__v8di)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_rolv_epi64 (__m512i __W, __mmask8 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_rolv_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_rolv_epi64 (__mmask8 __U, __m512i __A, __m512i __B) -{ - return 
(__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_rolv_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_ror_epi32(A, B) \ - ((__m512i)__builtin_ia32_prord512((__v16si)(__m512i)(A), (int)(B))) - -#define _mm512_mask_ror_epi32(W, U, A, B) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_ror_epi32(U, A, B) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_ror_epi32((A), (B)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_ror_epi64(A, B) \ - ((__m512i)__builtin_ia32_prorq512((__v8di)(__m512i)(A), (int)(B))) - -#define _mm512_mask_ror_epi64(W, U, A, B) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_ror_epi64(U, A, B) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_ror_epi64((A), (B)), \ - (__v8di)_mm512_setzero_si512())) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi32(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_pslldi512((__v16si)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_slli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_slli_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_slli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_slli_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_slli_epi64(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psllqi512((__v8di)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_slli_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_slli_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_slli_epi64(__mmask8 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_slli_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi32(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psrldi512((__v16si)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srli_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srli_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srli_epi32(__mmask16 __U, __m512i __A, unsigned int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srli_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srli_epi64(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psrlqi512((__v8di)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srli_epi64(__m512i __W, __mmask8 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srli_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i 
__DEFAULT_FN_ATTRS512 -_mm512_maskz_srli_epi64(__mmask8 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srli_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_load_epi32 (__m512i __W, __mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, - (__v16si) __W, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_load_epi32 (__mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_movdqa32load512_mask ((const __v16si *) __P, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_store_epi32 (void *__P, __mmask16 __U, __m512i __A) -{ - __builtin_ia32_movdqa32store512_mask ((__v16si *) __P, (__v16si) __A, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, - (__v16si) __A, - (__v16si) __W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectd_512 ((__mmask16) __U, - (__v16si) __A, - (__v16si) _mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, - (__v8di) __A, - (__v8di) __W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_selectq_512 ((__mmask8) __U, - (__v8di) __A, - (__v8di) _mm512_setzero_si512 ()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_load_epi64 (__m512i __W, __mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, - (__v8di) __W, - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_load_epi64 (__mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_movdqa64load512_mask ((const __v8di *) __P, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_store_epi64 (void *__P, __mmask8 __U, __m512i __A) -{ - __builtin_ia32_movdqa64store512_mask ((__v8di *) __P, (__v8di) __A, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_movedup_pd (__m512d __A) -{ - return (__m512d)__builtin_shufflevector((__v8df)__A, (__v8df)__A, - 0, 0, 2, 2, 4, 4, 6, 6); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_movedup_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_movedup_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_movedup_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_movedup_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -#define _mm512_fixupimm_round_pd(A, B, C, imm, R) \ - ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_fixupimm_round_pd(A, U, B, C, imm, R) \ - ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - 
(__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_fixupimm_pd(A, B, C, imm) \ - ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_fixupimm_pd(A, U, B, C, imm) \ - ((__m512d)__builtin_ia32_fixupimmpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_fixupimm_round_pd(U, A, B, C, imm, R) \ - ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - (int)(R))) - -#define _mm512_maskz_fixupimm_pd(U, A, B, C, imm) \ - ((__m512d)__builtin_ia32_fixupimmpd512_maskz((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8di)(__m512i)(C), \ - (int)(imm), (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_fixupimm_round_ps(A, B, C, imm, R) \ - ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_fixupimm_round_ps(A, U, B, C, imm, R) \ - ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_fixupimm_ps(A, B, C, imm) \ - ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_fixupimm_ps(A, U, B, C, imm) \ - ((__m512)__builtin_ia32_fixupimmps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_fixupimm_round_ps(U, A, B, C, imm, R) \ - ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - (int)(R))) - -#define _mm512_maskz_fixupimm_ps(U, A, B, C, imm) \ - ((__m512)__builtin_ia32_fixupimmps512_maskz((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_fixupimm_round_sd(A, B, C, imm, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fixupimm_round_sd(A, U, B, C, imm, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R))) - -#define _mm_fixupimm_sd(A, B, C, imm) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_fixupimm_sd(A, U, B, C, imm) \ - ((__m128d)__builtin_ia32_fixupimmsd_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_fixupimm_round_sd(U, A, B, C, imm, R) \ - ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fixupimm_sd(U, A, B, C, imm) \ 
- ((__m128d)__builtin_ia32_fixupimmsd_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_fixupimm_round_ss(A, B, C, imm, R) \ - ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fixupimm_round_ss(A, U, B, C, imm, R) \ - ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R))) - -#define _mm_fixupimm_ss(A, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_fixupimm_ss(A, U, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmss_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_fixupimm_round_ss(U, A, B, C, imm, R) \ - ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fixupimm_ss(U, A, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmss_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_getexp_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_getexp_sd (__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_getexpsd128_round_mask ((__v2df) __A, - (__v2df) __B, (__v2df) _mm_setzero_pd(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_getexp_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_getexpsd128_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_getexp_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_getexpsd128_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_getexp_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_getexp_ss (__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, - (__v4sf) __B, (__v4sf) _mm_setzero_ps(), (__mmask8) -1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) 
__builtin_ia32_getexpss128_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_getexp_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_getexpss128_round_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_getexp_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_getexpss128_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_getmant_round_sd(A, B, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_getmant_sd(A, B, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_sd(W, U, A, B, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_round_sd(W, U, A, B, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_getmant_sd(U, A, B, C, D) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_getmant_round_sd(U, A, B, C, D, R) \ - ((__m128d)__builtin_ia32_getmantsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (int)(((D)<<2) | (C)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_getmant_round_ss(A, B, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_getmant_ss(A, B, C, D) \ - ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_ss(W, U, A, B, C, D) \ - ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_round_ss(W, U, A, B, C, D, R) \ - ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_getmant_ss(U, A, B, C, D) \ - ((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_getmant_round_ss(U, A, B, C, D, R) \ - 
((__m128)__builtin_ia32_getmantss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (int)(((D)<<2) | (C)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kmov (__mmask16 __A) -{ - return __A; -} - -#define _mm_comi_round_sd(A, B, P, R) \ - ((int)__builtin_ia32_vcomisd((__v2df)(__m128d)(A), (__v2df)(__m128d)(B), \ - (int)(P), (int)(R))) - -#define _mm_comi_round_ss(A, B, P, R) \ - ((int)__builtin_ia32_vcomiss((__v4sf)(__m128)(A), (__v4sf)(__m128)(B), \ - (int)(P), (int)(R))) - -#ifdef __x86_64__ -#define _mm_cvt_roundsd_si64(A, R) \ - ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) -#endif - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi32(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_pslld512((__v16si) __A, (__v4si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sll_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sll_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sll_epi64(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psllq512((__v8di)__A, (__v2di)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sll_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sll_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sll_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sll_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sllv_epi32(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psllv16si((__v16si)__X, (__v16si)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sllv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sllv_epi32(__X, __Y), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sllv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sllv_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sllv_epi64(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psllv8di((__v8di)__X, (__v8di)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sllv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sllv_epi64(__X, __Y), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sllv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sllv_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi32(__m512i __A, __m128i __B) -{ - return 
(__m512i)__builtin_ia32_psrad512((__v16si) __A, (__v4si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sra_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_sra_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_sra_epi64(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psraq512((__v8di)__A, (__v2di)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_sra_epi64(__m512i __W, __mmask8 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sra_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_sra_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_sra_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srav_epi32(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psrav16si((__v16si)__X, (__v16si)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srav_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srav_epi32(__X, __Y), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srav_epi32(__mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srav_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srav_epi64(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psrav8di((__v8di)__X, (__v8di)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srav_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srav_epi64(__X, __Y), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srav_epi64(__mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srav_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi32(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psrld512((__v16si) __A, (__v4si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi32(__m512i __W, __mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srl_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi32(__mmask16 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srl_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srl_epi64(__m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_psrlq512((__v8di)__A, (__v2di)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srl_epi64(__m512i __W, __mmask8 
__U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srl_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srl_epi64(__mmask8 __U, __m512i __A, __m128i __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srl_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srlv_epi32(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psrlv16si((__v16si)__X, (__v16si)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srlv_epi32(__m512i __W, __mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srlv_epi32(__X, __Y), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srlv_epi32(__mmask16 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srlv_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srlv_epi64 (__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_psrlv8di((__v8di)__X, (__v8di)__Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srlv_epi64(__m512i __W, __mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srlv_epi64(__X, __Y), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srlv_epi64(__mmask8 __U, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srlv_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_ternarylogic_epi32(A, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)-1)) - -#define _mm512_mask_ternarylogic_epi32(A, U, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogd512_mask((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), (int)(imm), \ - (__mmask16)(U))) - -#define _mm512_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogd512_maskz((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), \ - (__v16si)(__m512i)(C), \ - (int)(imm), (__mmask16)(U))) - -#define _mm512_ternarylogic_epi64(A, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm512_mask_ternarylogic_epi64(A, U, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogq512_mask((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm512_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - ((__m512i)__builtin_ia32_pternlogq512_maskz((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (__v8di)(__m512i)(C), (int)(imm), \ - (__mmask8)(U))) - -#ifdef __x86_64__ -#define _mm_cvt_roundsd_i64(A, R) \ - ((long long)__builtin_ia32_vcvtsd2si64((__v2df)(__m128d)(A), (int)(R))) -#endif - -#define _mm_cvt_roundsd_si32(A, R) \ - ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) - -#define _mm_cvt_roundsd_i32(A, R) \ - ((int)__builtin_ia32_vcvtsd2si32((__v2df)(__m128d)(A), (int)(R))) - -#define _mm_cvt_roundsd_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvtsd2usi32((__v2df)(__m128d)(A), (int)(R))) - -static __inline__ unsigned 
__DEFAULT_FN_ATTRS128 -_mm_cvtsd_u32 (__m128d __A) -{ - return (unsigned) __builtin_ia32_vcvtsd2usi32 ((__v2df) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvt_roundsd_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvtsd2usi64((__v2df)(__m128d)(A), \ - (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvtsd_u64 (__m128d __A) -{ - return (unsigned long long) __builtin_ia32_vcvtsd2usi64 ((__v2df) - __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvt_roundss_si32(A, R) \ - ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) - -#define _mm_cvt_roundss_i32(A, R) \ - ((int)__builtin_ia32_vcvtss2si32((__v4sf)(__m128)(A), (int)(R))) - -#ifdef __x86_64__ -#define _mm_cvt_roundss_si64(A, R) \ - ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) - -#define _mm_cvt_roundss_i64(A, R) \ - ((long long)__builtin_ia32_vcvtss2si64((__v4sf)(__m128)(A), (int)(R))) -#endif - -#define _mm_cvt_roundss_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvtss2usi32((__v4sf)(__m128)(A), (int)(R))) - -static __inline__ unsigned __DEFAULT_FN_ATTRS128 -_mm_cvtss_u32 (__m128 __A) -{ - return (unsigned) __builtin_ia32_vcvtss2usi32 ((__v4sf) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvt_roundss_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvtss2usi64((__v4sf)(__m128)(A), \ - (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvtss_u64 (__m128 __A) -{ - return (unsigned long long) __builtin_ia32_vcvtss2usi64 ((__v4sf) - __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvtt_roundsd_i32(A, R) \ - ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) - -#define _mm_cvtt_roundsd_si32(A, R) \ - ((int)__builtin_ia32_vcvttsd2si32((__v2df)(__m128d)(A), (int)(R))) - -static __inline__ int __DEFAULT_FN_ATTRS128 -_mm_cvttsd_i32 (__m128d __A) -{ - return (int) __builtin_ia32_vcvttsd2si32 ((__v2df) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundsd_si64(A, R) \ - ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) - -#define _mm_cvtt_roundsd_i64(A, R) \ - ((long long)__builtin_ia32_vcvttsd2si64((__v2df)(__m128d)(A), (int)(R))) - -static __inline__ long long __DEFAULT_FN_ATTRS128 -_mm_cvttsd_i64 (__m128d __A) -{ - return (long long) __builtin_ia32_vcvttsd2si64 ((__v2df) __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvtt_roundsd_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvttsd2usi32((__v2df)(__m128d)(A), (int)(R))) - -static __inline__ unsigned __DEFAULT_FN_ATTRS128 -_mm_cvttsd_u32 (__m128d __A) -{ - return (unsigned) __builtin_ia32_vcvttsd2usi32 ((__v2df) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundsd_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvttsd2usi64((__v2df)(__m128d)(A), \ - (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvttsd_u64 (__m128d __A) -{ - return (unsigned long long) __builtin_ia32_vcvttsd2usi64 ((__v2df) - __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvtt_roundss_i32(A, R) \ - ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) - -#define _mm_cvtt_roundss_si32(A, R) \ - ((int)__builtin_ia32_vcvttss2si32((__v4sf)(__m128)(A), (int)(R))) - -static __inline__ int __DEFAULT_FN_ATTRS128 -_mm_cvttss_i32 (__m128 __A) -{ - return (int) __builtin_ia32_vcvttss2si32 ((__v4sf) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundss_i64(A, R) \ - 
((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) - -#define _mm_cvtt_roundss_si64(A, R) \ - ((long long)__builtin_ia32_vcvttss2si64((__v4sf)(__m128)(A), (int)(R))) - -static __inline__ long long __DEFAULT_FN_ATTRS128 -_mm_cvttss_i64 (__m128 __A) -{ - return (long long) __builtin_ia32_vcvttss2si64 ((__v4sf) __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvtt_roundss_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvttss2usi32((__v4sf)(__m128)(A), (int)(R))) - -static __inline__ unsigned __DEFAULT_FN_ATTRS128 -_mm_cvttss_u32 (__m128 __A) -{ - return (unsigned) __builtin_ia32_vcvttss2usi32 ((__v4sf) __A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundss_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvttss2usi64((__v4sf)(__m128)(A), \ - (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvttss_u64 (__m128 __A) -{ - return (unsigned long long) __builtin_ia32_vcvttss2usi64 ((__v4sf) - __A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm512_permute_pd(X, C) \ - ((__m512d)__builtin_ia32_vpermilpd512((__v8df)(__m512d)(X), (int)(C))) - -#define _mm512_mask_permute_pd(W, U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_permute_pd(U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permute_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_permute_ps(X, C) \ - ((__m512)__builtin_ia32_vpermilps512((__v16sf)(__m512)(X), (int)(C))) - -#define _mm512_mask_permute_ps(W, U, X, C) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_permute_ps(U, X, C) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_permute_ps((X), (C)), \ - (__v16sf)_mm512_setzero_ps())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutevar_pd(__m512d __A, __m512i __C) -{ - return (__m512d)__builtin_ia32_vpermilvarpd512((__v8df)__A, (__v8di)__C); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutevar_pd(__m512d __W, __mmask8 __U, __m512d __A, __m512i __C) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutevar_pd(__A, __C), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutevar_pd(__mmask8 __U, __m512d __A, __m512i __C) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutevar_pd(__A, __C), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutevar_ps(__m512 __A, __m512i __C) -{ - return (__m512)__builtin_ia32_vpermilvarps512((__v16sf)__A, (__v16si)__C); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutevar_ps(__m512 __W, __mmask16 __U, __m512 __A, __m512i __C) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutevar_ps(__A, __C), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutevar_ps(__mmask16 __U, __m512 __A, __m512i __C) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutevar_ps(__A, __C), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_pd(__m512d __A, __m512i __I, __m512d __B) -{ - return (__m512d)__builtin_ia32_vpermi2varpd512((__v8df)__A, (__v8di)__I, - (__v8df)__B); -} - 
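/*
 * Illustrative usage sketch (not part of the original header): shows how
 * _mm512_permutex2var_pd builds each result lane from either of its two
 * vector operands. Index values 0-7 select lanes of the first operand,
 * 8-15 select lanes of the second. The helper name and index pattern are
 * assumptions made for this example; it presumes the translation unit is
 * compiled with AVX-512F enabled (e.g. -mavx512f).
 */
static inline __m512d example_pair_even_lanes_pd(__m512d a, __m512d b)
{
    /* Result lane 2k receives a[2k]; result lane 2k+1 receives b[2k]. */
    const __m512i idx = _mm512_setr_epi64(0, 8, 2, 10, 4, 12, 6, 14);
    return _mm512_permutex2var_pd(a, idx, b);
}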
-static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_pd(__m512d __A, __mmask8 __U, __m512i __I, __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)__A); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_pd(__m512d __A, __m512i __I, __mmask8 __U, - __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)(__m512d)__I); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_pd(__mmask8 __U, __m512d __A, __m512i __I, - __m512d __B) -{ - return (__m512d)__builtin_ia32_selectpd_512(__U, - (__v8df)_mm512_permutex2var_pd(__A, __I, __B), - (__v8df)_mm512_setzero_pd()); -} - -static __inline __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ps(__m512 __A, __m512i __I, __m512 __B) -{ - return (__m512)__builtin_ia32_vpermi2varps512((__v16sf)__A, (__v16si)__I, - (__v16sf) __B); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutex2var_ps(__m512 __A, __mmask16 __U, __m512i __I, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)__A); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask2_permutex2var_ps(__m512 __A, __m512i __I, __mmask16 __U, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)(__m512)__I); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutex2var_ps(__mmask16 __U, __m512 __A, __m512i __I, __m512 __B) -{ - return (__m512)__builtin_ia32_selectps_512(__U, - (__v16sf)_mm512_permutex2var_ps(__A, __I, __B), - (__v16sf)_mm512_setzero_ps()); -} - - -#define _mm512_cvtt_roundpd_epu32(A, R) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_undefined_si256(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvtt_roundpd_epu32(W, U, A, R) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)(__m256i)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundpd_epu32(U, A, R) \ - ((__m256i)__builtin_ia32_cvttpd2udq512_mask((__v8df)(__m512d)(A), \ - (__v8si)_mm256_setzero_si256(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvttpd_epu32 (__m512d __A) -{ - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_undefined_si256 (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttpd_epu32 (__m256i __W, __mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttpd_epu32 (__mmask8 __U, __m512d __A) -{ - return (__m256i) __builtin_ia32_cvttpd2udq512_mask ((__v8df) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_roundscale_round_sd(A, B, imm, R) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R))) - -#define _mm_roundscale_sd(A, B, imm) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(imm), \ - 
_MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_sd(W, U, A, B, imm) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(imm), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_round_sd(W, U, A, B, I, R) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R))) - -#define _mm_maskz_roundscale_sd(U, A, B, I) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_roundscale_round_sd(U, A, B, I, R) \ - ((__m128d)__builtin_ia32_rndscalesd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(I), \ - (int)(R))) - -#define _mm_roundscale_round_ss(A, B, imm, R) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - (int)(R))) - -#define _mm_roundscale_ss(A, B, imm) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(imm), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_ss(W, U, A, B, I) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_round_ss(W, U, A, B, I, R) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(I), \ - (int)(R))) - -#define _mm_maskz_roundscale_ss(U, A, B, I) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_roundscale_round_ss(U, A, B, I, R) \ - ((__m128)__builtin_ia32_rndscaless_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(I), \ - (int)(R))) - -#define _mm512_scalef_round_pd(A, B, R) \ - ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_scalef_round_pd(W, U, A, B, R) \ - ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_scalef_round_pd(U, A, B, R) \ - ((__m512d)__builtin_ia32_scalefpd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_scalef_pd (__m512d __A, __m512d __B) -{ - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_undefined_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_scalef_pd (__m512d __W, __mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_scalef_pd 
(__mmask8 __U, __m512d __A, __m512d __B) -{ - return (__m512d) __builtin_ia32_scalefpd512_mask ((__v8df) __A, - (__v8df) __B, - (__v8df) - _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_scalef_round_ps(A, B, R) \ - ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_scalef_round_ps(W, U, A, B, R) \ - ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_scalef_round_ps(U, A, B, R) \ - ((__m512)__builtin_ia32_scalefps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_scalef_ps (__m512 __A, __m512 __B) -{ - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_undefined_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_scalef_ps (__m512 __W, __mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_scalef_ps (__mmask16 __U, __m512 __A, __m512 __B) -{ - return (__m512) __builtin_ia32_scalefps512_mask ((__v16sf) __A, - (__v16sf) __B, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_scalef_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_scalef_sd (__m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_scalefsd_round_mask ((__v2df) __A, - (__v2df)( __B), (__v2df) _mm_setzero_pd(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_scalef_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_scalefsd_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_scalef_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_scalefsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_scalef_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_scalef_ss (__m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_scalefss_round_mask ((__v4sf) __A, - (__v4sf)( __B), (__v4sf) _mm_setzero_ps(), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static 
__inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_scalef_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_scalefss_round_mask ( (__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_scalef_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_scalefss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi32(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psradi512((__v16si)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srai_epi32(__m512i __W, __mmask16 __U, __m512i __A, - unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srai_epi32(__A, __B), - (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi32(__mmask16 __U, __m512i __A, - unsigned int __B) { - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__U, - (__v16si)_mm512_srai_epi32(__A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_srai_epi64(__m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_psraqi512((__v8di)__A, __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_srai_epi64(__m512i __W, __mmask8 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srai_epi64(__A, __B), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_srai_epi64(__mmask8 __U, __m512i __A, unsigned int __B) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__U, - (__v8di)_mm512_srai_epi64(__A, __B), - (__v8di)_mm512_setzero_si512()); -} - -#define _mm512_shuffle_f32x4(A, B, imm) \ - ((__m512)__builtin_ia32_shuf_f32x4((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(imm))) - -#define _mm512_mask_shuffle_f32x4(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_shuffle_f32x4(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_f32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) - -#define _mm512_shuffle_f64x2(A, B, imm) \ - ((__m512d)__builtin_ia32_shuf_f64x2((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(imm))) - -#define _mm512_mask_shuffle_f64x2(W, U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_shuffle_f64x2(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_f64x2((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_shuffle_i32x4(A, B, imm) \ - ((__m512i)__builtin_ia32_shuf_i32x4((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(imm))) - -#define 
_mm512_mask_shuffle_i32x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_shuffle_i32x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_i32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_shuffle_i64x2(A, B, imm) \ - ((__m512i)__builtin_ia32_shuf_i64x2((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(imm))) - -#define _mm512_mask_shuffle_i64x2(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_shuffle_i64x2(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shuffle_i64x2((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_shuffle_pd(A, B, M) \ - ((__m512d)__builtin_ia32_shufpd512((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(B), (int)(M))) - -#define _mm512_mask_shuffle_pd(W, U, A, B, M) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_shuffle_pd(U, A, B, M) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_shuffle_pd((A), (B), (M)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_shuffle_ps(A, B, M) \ - ((__m512)__builtin_ia32_shufps512((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(B), (int)(M))) - -#define _mm512_mask_shuffle_ps(W, U, A, B, M) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_shuffle_ps(U, A, B, M) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_shuffle_ps((A), (B), (M)), \ - (__v16sf)_mm512_setzero_ps())) - -#define _mm_sqrt_round_sd(A, B, R) \ - ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_sqrt_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_sqrt_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_sqrt_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d) __builtin_ia32_sqrtsd_round_mask ( (__v2df) __A, - (__v2df) __B, - (__v2df) _mm_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_sqrt_round_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_sqrtsd_round_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_sqrt_round_ss(A, B, R) \ - ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_sqrt_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_sqrt_round_ss(W, U, 
A, B, R) \ - ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(W), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_sqrt_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128) __builtin_ia32_sqrtss_round_mask ( (__v4sf) __A, - (__v4sf) __B, - (__v4sf) _mm_setzero_ps (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_sqrt_round_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_sqrtss_round_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f32x4(__m128 __A) -{ - return (__m512)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, - 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f32x4(__m512 __O, __mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x4(__A), - (__v16sf)__O); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f32x4(__mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__M, - (__v16sf)_mm512_broadcast_f32x4(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_broadcast_f64x4(__m256d __A) -{ - return (__m512d)__builtin_shufflevector((__v4df)__A, (__v4df)__A, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_f64x4(__m512d __O, __mmask8 __M, __m256d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x4(__A), - (__v8df)__O); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_f64x4(__mmask8 __M, __m256d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__M, - (__v8df)_mm512_broadcast_f64x4(__A), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i32x4(__m128i __A) -{ - return (__m512i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, - 0, 1, 2, 3, 0, 1, 2, 3, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i32x4(__m512i __O, __mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x4(__A), - (__v16si)__O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i32x4(__mmask16 __M, __m128i __A) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_broadcast_i32x4(__A), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_broadcast_i64x4(__m256i __A) -{ - return (__m512i)__builtin_shufflevector((__v4di)__A, (__v4di)__A, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcast_i64x4(__m512i __O, __mmask8 __M, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x4(__A), - (__v8di)__O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcast_i64x4(__mmask8 __M, __m256i __A) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_broadcast_i64x4(__A), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastsd_pd (__m512d __O, __mmask8 __M, __m128d __A) -{ - return 
(__m512d)__builtin_ia32_selectpd_512(__M, - (__v8df) _mm512_broadcastsd_pd(__A), - (__v8df) __O); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ - return (__m512d)__builtin_ia32_selectpd_512(__M, - (__v8df) _mm512_broadcastsd_pd(__A), - (__v8df) _mm512_setzero_pd()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_broadcastss_ps (__m512 __O, __mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__M, - (__v16sf) _mm512_broadcastss_ps(__A), - (__v16sf) __O); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_broadcastss_ps (__mmask16 __M, __m128 __A) -{ - return (__m512)__builtin_ia32_selectps_512(__M, - (__v16sf) _mm512_broadcastss_ps(__A), - (__v16sf) _mm512_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi32_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask16) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi32_epi8 (__mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb512_mask ((__v16si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovsdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi32_epi16 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) _mm256_undefined_si256 (), - (__mmask16) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) __O, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi32_epi16 (__mmask16 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsdw512_mask ((__v16si) __A, - (__v16hi) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovsdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi64_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi64_epi8 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb512_mask ((__v8di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovsqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi64_epi32 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) 
_mm256_undefined_si256 (), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) __O, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi64_epi32 (__mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovsqd512_mask ((__v8di) __A, - (__v8si) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_storeu_epi32 (void *__P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovsqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtsepi64_epi16 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtsepi64_epi16 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw512_mask ((__v8di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovsqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi32_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask16) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi32_epi8 (__mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb512_mask ((__v16si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovusdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi32_epi16 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) _mm256_undefined_si256 (), - (__mmask16) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) __O, - __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi32_epi16 (__mmask16 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusdw512_mask ((__v16si) __A, - (__v16hi) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi32_storeu_epi16 (void *__P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovusdw512mem_mask ((__v16hi*) __P, (__v16si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi64_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 
-_mm512_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi64_epi8 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb512_mask ((__v8di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovusqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi64_epi32 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) _mm256_undefined_si256 (), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) __O, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi64_epi32 (__mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovusqd512_mask ((__v8di) __A, - (__v8si) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovusqd512mem_mask ((__v8si*) __P, (__v8di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtusepi64_epi16 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtusepi64_epi16 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw512_mask ((__v8di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtusepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovusqw512mem_mask ((__v8hi*) __P, (__v8di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask16) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_epi8 (__m128i __O, __mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_epi8 (__mmask16 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovdb512_mask ((__v16si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_storeu_epi8 (void * __P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovdb512mem_mask ((__v16qi *) __P, (__v16si) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_epi16 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) _mm256_undefined_si256 (), - (__mmask16) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_epi16 (__m256i __O, __mmask16 __M, __m512i __A) -{ - return (__m256i) 
__builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) __O, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_epi16 (__mmask16 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovdw512_mask ((__v16si) __A, - (__v16hi) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_storeu_epi16 (void * __P, __mmask16 __M, __m512i __A) -{ - __builtin_ia32_pmovdw512mem_mask ((__v16hi *) __P, (__v16si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_epi8 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_epi8 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqb512_mask ((__v8di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovqb512mem_mask ((__v16qi *) __P, (__v8di) __A, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_epi32 (__m512i __A) -{ - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) _mm256_undefined_si256 (), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_epi32 (__m256i __O, __mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) __O, __M); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_epi32 (__mmask8 __M, __m512i __A) -{ - return (__m256i) __builtin_ia32_pmovqd512_mask ((__v8di) __A, - (__v8si) _mm256_setzero_si256 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_storeu_epi32 (void* __P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovqd512mem_mask ((__v8si *) __P, (__v8di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_epi16 (__m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) _mm_undefined_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_epi16 (__mmask8 __M, __m512i __A) -{ - return (__m128i) __builtin_ia32_pmovqw512_mask ((__v8di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_storeu_epi16 (void *__P, __mmask8 __M, __m512i __A) -{ - __builtin_ia32_pmovqw512mem_mask ((__v8hi *) __P, (__v8di) __A, __M); -} - -#define _mm512_extracti32x4_epi32(A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1)) - -#define _mm512_mask_extracti32x4_epi32(W, U, A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extracti32x4_epi32(U, A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_mask((__v16si)(__m512i)(A), (int)(imm), \ - 
(__v4si)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm512_extracti64x4_epi64(A, imm) \ - ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_undefined_si256(), \ - (__mmask8)-1)) - -#define _mm512_mask_extracti64x4_epi64(W, U, A, imm) \ - ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)(__m256i)(W), \ - (__mmask8)(U))) - -#define _mm512_maskz_extracti64x4_epi64(U, A, imm) \ - ((__m256i)__builtin_ia32_extracti64x4_mask((__v8di)(__m512i)(A), (int)(imm), \ - (__v4di)_mm256_setzero_si256(), \ - (__mmask8)(U))) - -#define _mm512_insertf64x4(A, B, imm) \ - ((__m512d)__builtin_ia32_insertf64x4((__v8df)(__m512d)(A), \ - (__v4df)(__m256d)(B), (int)(imm))) - -#define _mm512_mask_insertf64x4(W, U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_insertf64x4(U, A, B, imm) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_insertf64x4((A), (B), (imm)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_inserti64x4(A, B, imm) \ - ((__m512i)__builtin_ia32_inserti64x4((__v8di)(__m512i)(A), \ - (__v4di)(__m256i)(B), (int)(imm))) - -#define _mm512_mask_inserti64x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_inserti64x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_inserti64x4((A), (B), (imm)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_insertf32x4(A, B, imm) \ - ((__m512)__builtin_ia32_insertf32x4((__v16sf)(__m512)(A), \ - (__v4sf)(__m128)(B), (int)(imm))) - -#define _mm512_mask_insertf32x4(W, U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)(__m512)(W))) - -#define _mm512_maskz_insertf32x4(U, A, B, imm) \ - ((__m512)__builtin_ia32_selectps_512((__mmask16)(U), \ - (__v16sf)_mm512_insertf32x4((A), (B), (imm)), \ - (__v16sf)_mm512_setzero_ps())) - -#define _mm512_inserti32x4(A, B, imm) \ - ((__m512i)__builtin_ia32_inserti32x4((__v16si)(__m512i)(A), \ - (__v4si)(__m128i)(B), (int)(imm))) - -#define _mm512_mask_inserti32x4(W, U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_inserti32x4(U, A, B, imm) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_inserti32x4((A), (B), (imm)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_getmant_round_pd(A, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_getmant_round_pd(W, U, A, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_getmant_round_pd(U, A, B, C, R) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_getmant_pd(A, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define 
_mm512_mask_getmant_pd(W, U, A, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getmant_pd(U, A, B, C) \ - ((__m512d)__builtin_ia32_getmantpd512_mask((__v8df)(__m512d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_getmant_round_ps(A, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_getmant_round_ps(W, U, A, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_getmant_round_ps(U, A, B, C, R) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2) | (B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_getmant_ps(A, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getmant_ps(W, U, A, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_maskz_getmant_ps(U, A, B, C) \ - ((__m512)__builtin_ia32_getmantps512_mask((__v16sf)(__m512)(A), \ - (int)(((C)<<2)|(B)), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_getexp_round_pd(A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_getexp_round_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_getexp_round_pd(U, A, R) \ - ((__m512d)__builtin_ia32_getexppd512_mask((__v8df)(__m512d)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_getexp_pd (__m512d __A) -{ - return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, - (__v8df) _mm512_undefined_pd (), - (__mmask8) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_getexp_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_getexp_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_getexppd512_mask ((__v8df) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_getexp_round_ps(A, R) \ - ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_getexp_round_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)(__m512)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_getexp_round_ps(U, A, R) \ - ((__m512)__builtin_ia32_getexpps512_mask((__v16sf)(__m512)(A), \ - (__v16sf)_mm512_setzero_ps(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ 
__m512 __DEFAULT_FN_ATTRS512 -_mm512_getexp_ps (__m512 __A) -{ - return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, - (__v16sf) _mm512_undefined_ps (), - (__mmask16) -1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_getexp_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_getexp_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_getexpps512_mask ((__v16sf) __A, - (__v16sf) _mm512_setzero_ps (), - (__mmask16) __U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_i64gather_ps(index, addr, scale) \ - ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)_mm256_undefined_ps(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale))) - -#define _mm512_mask_i64gather_ps(v1_old, mask, index, addr, scale) \ - ((__m256)__builtin_ia32_gatherdiv16sf((__v8sf)(__m256)(v1_old),\ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i64gather_epi32(index, addr, scale) \ - ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)_mm256_undefined_si256(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)-1, (int)(scale))) - -#define _mm512_mask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m256i)__builtin_ia32_gatherdiv16si((__v8si)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i64gather_pd(index, addr, scale) \ - ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)_mm512_undefined_pd(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale))) - -#define _mm512_mask_i64gather_pd(v1_old, mask, index, addr, scale) \ - ((__m512d)__builtin_ia32_gatherdiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i64gather_epi64(index, addr, scale) \ - ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), (__mmask8)-1, \ - (int)(scale))) - -#define _mm512_mask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m512i)__builtin_ia32_gatherdiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8di)(__m512i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i32gather_ps(index, addr, scale) \ - ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)_mm512_undefined_ps(), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)-1, (int)(scale))) - -#define _mm512_mask_i32gather_ps(v1_old, mask, index, addr, scale) \ - ((__m512)__builtin_ia32_gathersiv16sf((__v16sf)(__m512)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512)(index), \ - (__mmask16)(mask), (int)(scale))) - -#define _mm512_i32gather_epi32(index, addr, scale) \ - ((__m512i)__builtin_ia32_gathersiv16si((__v16si)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)-1, (int)(scale))) - -#define _mm512_mask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m512i)__builtin_ia32_gathersiv16si((__v16si)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v16si)(__m512i)(index), \ - (__mmask16)(mask), (int)(scale))) - -#define _mm512_i32gather_pd(index, addr, scale) \ - ((__m512d)__builtin_ia32_gathersiv8df((__v8df)_mm512_undefined_pd(), \ 
- (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale))) - -#define _mm512_mask_i32gather_pd(v1_old, mask, index, addr, scale) \ - ((__m512d)__builtin_ia32_gathersiv8df((__v8df)(__m512d)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i32gather_epi64(index, addr, scale) \ - ((__m512i)__builtin_ia32_gathersiv8di((__v8di)_mm512_undefined_epi32(), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), (__mmask8)-1, \ - (int)(scale))) - -#define _mm512_mask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m512i)__builtin_ia32_gathersiv8di((__v8di)(__m512i)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm512_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)-1, \ - (__v8di)(__m512i)(index), \ - (__v8sf)(__m256)(v1), (int)(scale)) - -#define _mm512_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv16sf((void *)(addr), (__mmask8)(mask), \ - (__v8di)(__m512i)(index), \ - (__v8sf)(__m256)(v1), (int)(scale)) - -#define _mm512_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)-1, \ - (__v8di)(__m512i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)) - -#define _mm512_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv16si((void *)(addr), (__mmask8)(mask), \ - (__v8di)(__m512i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)) - -#define _mm512_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)-1, \ - (__v8di)(__m512i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)) - -#define _mm512_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8df((void *)(addr), (__mmask8)(mask), \ - (__v8di)(__m512i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)) - -#define _mm512_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)-1, \ - (__v8di)(__m512i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)) - -#define _mm512_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8di((void *)(addr), (__mmask8)(mask), \ - (__v8di)(__m512i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)) - -#define _mm512_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)-1, \ - (__v16si)(__m512i)(index), \ - (__v16sf)(__m512)(v1), (int)(scale)) - -#define _mm512_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv16sf((void *)(addr), (__mmask16)(mask), \ - (__v16si)(__m512i)(index), \ - (__v16sf)(__m512)(v1), (int)(scale)) - -#define _mm512_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)-1, \ - (__v16si)(__m512i)(index), \ - (__v16si)(__m512i)(v1), (int)(scale)) - -#define _mm512_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv16si((void *)(addr), (__mmask16)(mask), \ - (__v16si)(__m512i)(index), \ - (__v16si)(__m512i)(v1), (int)(scale)) - -#define _mm512_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), \ - (__v8df)(__m512d)(v1), (int)(scale)) - -#define _mm512_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8df((void *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), \ - 
(__v8df)(__m512d)(v1), (int)(scale)) - -#define _mm512_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)) - -#define _mm512_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8di((void *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), \ - (__v8di)(__m512i)(v1), (int)(scale)) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, - (__v4sf)__A, - (__v4sf)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmadd_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fmadd_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, - (__v4sf)__B, - (__v4sf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmadd_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, - (__v4sf)__X, - (__v4sf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmadd_round_ss(W, X, Y, U, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, - (__v4sf)__A, - -(__v4sf)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmsub_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fmsub_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, - (__v4sf)__B, - -(__v4sf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmsub_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, - (__v4sf)__X, - (__v4sf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmsub_round_ss(W, X, Y, U, R) \ - ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - (__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 
__DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, - -(__v4sf)__A, - (__v4sf)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmadd_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fnmadd_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmadd_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, - -(__v4sf)__B, - (__v4sf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmadd_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - (__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmaddss3_mask3((__v4sf)__W, - -(__v4sf)__X, - (__v4sf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmadd_round_ss(W, X, Y, U, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_vfmaddss3_mask((__v4sf)__W, - -(__v4sf)__A, - -(__v4sf)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmsub_round_ss(A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fnmsub_round_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_vfmaddss3_mask((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_ss (__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return __builtin_ia32_vfmaddss3_maskz((__v4sf)__A, - -(__v4sf)__B, - -(__v4sf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmsub_round_ss(U, A, B, C, R) \ - ((__m128)__builtin_ia32_vfmaddss3_maskz((__v4sf)(__m128)(A), \ - -(__v4sf)(__m128)(B), \ - -(__v4sf)(__m128)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_ss (__m128 __W, __m128 __X, __m128 __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmsubss3_mask3((__v4sf)__W, - -(__v4sf)__X, - (__v4sf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmsub_round_ss(W, X, Y, U, R) \ - ((__m128)__builtin_ia32_vfmsubss3_mask3((__v4sf)(__m128)(W), \ - -(__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, - (__v2df)__A, - (__v2df)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmadd_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R))) - -#define 
_mm_mask_fmadd_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, - (__v2df)__B, - (__v2df)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmadd_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, - (__v2df)__X, - (__v2df)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmadd_round_sd(W, X, Y, U, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, - (__v2df)__A, - -(__v2df)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmsub_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fmsub_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, - (__v2df)__B, - -(__v2df)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmsub_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, - (__v2df)__X, - (__v2df)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmsub_round_sd(W, X, Y, U, R) \ - ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - (__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, - -(__v2df)__A, - (__v2df)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmadd_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fnmadd_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmadd_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, - -(__v2df)__B, - (__v2df)__C, - 
(__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmadd_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - (__v2df)(__m128d)(C), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmaddsd3_mask3((__v2df)__W, - -(__v2df)__X, - (__v2df)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmadd_round_sd(W, X, Y, U, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_vfmaddsd3_mask((__v2df)__W, - -(__v2df)__A, - -(__v2df)__B, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmsub_round_sd(A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), (__mmask8)-1, \ - (int)(R))) - -#define _mm_mask_fnmsub_round_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_mask((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_sd (__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return __builtin_ia32_vfmaddsd3_maskz((__v2df)__A, - -(__v2df)__B, - -(__v2df)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmsub_round_sd(U, A, B, C, R) \ - ((__m128d)__builtin_ia32_vfmaddsd3_maskz((__v2df)(__m128d)(A), \ - -(__v2df)(__m128d)(B), \ - -(__v2df)(__m128d)(C), \ - (__mmask8)(U), \ - (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_sd (__m128d __W, __m128d __X, __m128d __Y, __mmask8 __U) -{ - return __builtin_ia32_vfmsubsd3_mask3((__v2df)__W, - -(__v2df)__X, - (__v2df)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmsub_round_sd(W, X, Y, U, R) \ - ((__m128d)__builtin_ia32_vfmsubsd3_mask3((__v2df)(__m128d)(W), \ - -(__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_permutex_pd(X, C) \ - ((__m512d)__builtin_ia32_permdf512((__v8df)(__m512d)(X), (int)(C))) - -#define _mm512_mask_permutex_pd(W, U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)(__m512d)(W))) - -#define _mm512_maskz_permutex_pd(U, X, C) \ - ((__m512d)__builtin_ia32_selectpd_512((__mmask8)(U), \ - (__v8df)_mm512_permutex_pd((X), (C)), \ - (__v8df)_mm512_setzero_pd())) - -#define _mm512_permutex_epi64(X, C) \ - ((__m512i)__builtin_ia32_permdi512((__v8di)(__m512i)(X), (int)(C))) - -#define _mm512_mask_permutex_epi64(W, U, X, C) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)(__m512i)(W))) - -#define _mm512_maskz_permutex_epi64(U, X, C) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_permutex_epi64((X), (C)), \ - (__v8di)_mm512_setzero_si512())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_pd (__m512i __X, __m512d __Y) -{ - return (__m512d)__builtin_ia32_permvardf512((__v8df) __Y, (__v8di) __X); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_pd (__m512d __W, __mmask8 __U, __m512i __X, __m512d __Y) -{ - 
return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutexvar_pd(__X, __Y), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_pd (__mmask8 __U, __m512i __X, __m512d __Y) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_permutexvar_pd(__X, __Y), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi64 (__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_permvardi512((__v8di)__Y, (__v8di)__X); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi64 (__mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_permutexvar_epi64(__X, __Y), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi64 (__m512i __W, __mmask8 __M, __m512i __X, - __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512((__mmask8)__M, - (__v8di)_mm512_permutexvar_epi64(__X, __Y), - (__v8di)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_ps (__m512i __X, __m512 __Y) -{ - return (__m512)__builtin_ia32_permvarsf512((__v16sf)__Y, (__v16si)__X); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_ps (__m512 __W, __mmask16 __U, __m512i __X, __m512 __Y) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutexvar_ps(__X, __Y), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_ps (__mmask16 __U, __m512i __X, __m512 __Y) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_permutexvar_ps(__X, __Y), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_epi32 (__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_permvarsi512((__v16si)__Y, (__v16si)__X); -} - -#define _mm512_permutevar_epi32 _mm512_permutexvar_epi32 - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_permutexvar_epi32 (__mmask16 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_permutexvar_epi32(__X, __Y), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_permutexvar_epi32 (__m512i __W, __mmask16 __M, __m512i __X, - __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectd_512((__mmask16)__M, - (__v16si)_mm512_permutexvar_epi32(__X, __Y), - (__v16si)__W); -} - -#define _mm512_mask_permutevar_epi32 _mm512_mask_permutexvar_epi32 - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kand (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_kandhi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kandn (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_kandnhi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kor (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_korhi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_kortestc (__mmask16 __A, __mmask16 __B) -{ - return __builtin_ia32_kortestchi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ int __DEFAULT_FN_ATTRS -_mm512_kortestz (__mmask16 __A, __mmask16 __B) -{ - return __builtin_ia32_kortestzhi ((__mmask16) __A, (__mmask16) __B); -} - -static 
__inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestc_mask16_u8(__mmask16 __A, __mmask16 __B) -{ - return (unsigned char)__builtin_ia32_kortestchi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortestz_mask16_u8(__mmask16 __A, __mmask16 __B) -{ - return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_kortest_mask16_u8(__mmask16 __A, __mmask16 __B, unsigned char *__C) { - *__C = (unsigned char)__builtin_ia32_kortestchi(__A, __B); - return (unsigned char)__builtin_ia32_kortestzhi(__A, __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kunpackb (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_kunpckhi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kxnor (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_kxnorhi ((__mmask16) __A, (__mmask16) __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_mm512_kxor (__mmask16 __A, __mmask16 __B) -{ - return (__mmask16) __builtin_ia32_kxorhi ((__mmask16) __A, (__mmask16) __B); -} - -#define _kand_mask16 _mm512_kand -#define _kandn_mask16 _mm512_kandn -#define _knot_mask16 _mm512_knot -#define _kor_mask16 _mm512_kor -#define _kxnor_mask16 _mm512_kxnor -#define _kxor_mask16 _mm512_kxor - -#define _kshiftli_mask16(A, I) \ - ((__mmask16)__builtin_ia32_kshiftlihi((__mmask16)(A), (unsigned int)(I))) - -#define _kshiftri_mask16(A, I) \ - ((__mmask16)__builtin_ia32_kshiftrihi((__mmask16)(A), (unsigned int)(I))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_cvtmask16_u32(__mmask16 __A) { - return (unsigned int)__builtin_ia32_kmovw((__mmask16)__A); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_cvtu32_mask16(unsigned int __A) { - return (__mmask16)__builtin_ia32_kmovw((__mmask16)__A); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS -_load_mask16(__mmask16 *__A) { - return (__mmask16)__builtin_ia32_kmovw(*(__mmask16 *)__A); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_store_mask16(__mmask16 *__A, __mmask16 __B) { - *(__mmask16 *)__A = __builtin_ia32_kmovw((__mmask16)__B); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_si512 (void * __P, __m512i __A) -{ - typedef __v8di __v8di_aligned __attribute__((aligned(64))); - __builtin_nontemporal_store((__v8di_aligned)__A, (__v8di_aligned*)__P); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_stream_load_si512 (void const *__P) -{ - typedef __v8di __v8di_aligned __attribute__((aligned(64))); - return (__m512i) __builtin_nontemporal_load((const __v8di_aligned *)__P); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_pd (void *__P, __m512d __A) -{ - typedef __v8df __v8df_aligned __attribute__((aligned(64))); - __builtin_nontemporal_store((__v8df_aligned)__A, (__v8df_aligned*)__P); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_stream_ps (void *__P, __m512 __A) -{ - typedef __v16sf __v16sf_aligned __attribute__((aligned(64))); - __builtin_nontemporal_store((__v16sf_aligned)__A, (__v16sf_aligned*)__P); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_compress_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_compress_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_compressdf512_mask ((__v8df) __A, - (__v8df) - _mm512_setzero_pd (), - 
(__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_compress_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_compress_epi64 (__mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_compressdi512_mask ((__v8di) __A, - (__v8di) - _mm512_setzero_si512 (), - (__mmask8) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_compress_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_compress_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_compresssf512_mask ((__v16sf) __A, - (__v16sf) - _mm512_setzero_ps (), - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_compress_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_compress_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_compresssi512_mask ((__v16si) __A, - (__v16si) - _mm512_setzero_si512 (), - (__mmask16) __U); -} - -#define _mm_cmp_round_ss_mask(X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_cmp_round_ss_mask(M, X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), (int)(R))) - -#define _mm_cmp_ss_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_ss_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpss_mask((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_cmp_round_sd_mask(X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_cmp_round_sd_mask(M, X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), (int)(R))) - -#define _mm_cmp_sd_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_sd_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsd_mask((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (int)(P), \ - (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION)) - -/* Bit Test */ - -static __inline __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_test_epi32_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpneq_epi32_mask (_mm512_and_epi32(__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_mask_test_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpneq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline __mmask8 __DEFAULT_FN_ATTRS512 -_mm512_test_epi64_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpneq_epi64_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask8 
__DEFAULT_FN_ATTRS512 -_mm512_mask_test_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpneq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_testn_epi32_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpeq_epi32_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS512 -_mm512_mask_testn_epi32_mask (__mmask16 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpeq_epi32_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 -_mm512_testn_epi64_mask (__m512i __A, __m512i __B) -{ - return _mm512_cmpeq_epi64_mask (_mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS512 -_mm512_mask_testn_epi64_mask (__mmask8 __U, __m512i __A, __m512i __B) -{ - return _mm512_mask_cmpeq_epi64_mask (__U, _mm512_and_epi32 (__A, __B), - _mm512_setzero_si512()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_movehdup_ps (__m512 __A) -{ - return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, - 1, 1, 3, 3, 5, 5, 7, 7, 9, 9, 11, 11, 13, 13, 15, 15); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_movehdup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_movehdup_ps(__A), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_movehdup_ps (__mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_movehdup_ps(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_moveldup_ps (__m512 __A) -{ - return (__m512)__builtin_shufflevector((__v16sf)__A, (__v16sf)__A, - 0, 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_moveldup_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_moveldup_ps(__A), - (__v16sf)__W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_moveldup_ps (__mmask16 __U, __m512 __A) -{ - return (__m512)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_moveldup_ps(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_move_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_move_ss (__mmask8 __U, __m128 __A, __m128 __B) -{ - return __builtin_ia32_selectss_128(__U, _mm_move_ss(__A, __B), - _mm_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_move_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_move_sd (__mmask8 __U, __m128d __A, __m128d __B) -{ - return __builtin_ia32_selectsd_128(__U, _mm_move_sd(__A, __B), - _mm_setzero_pd()); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_ss (float * __W, __mmask8 __U, __m128 __A) -{ - __builtin_ia32_storess128_mask ((__v4sf *)__W, __A, __U & 1); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_sd (double * __W, __mmask8 __U, __m128d __A) -{ - 
__builtin_ia32_storesd128_mask ((__v2df *)__W, __A, __U & 1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_load_ss (__m128 __W, __mmask8 __U, const float* __A) -{ - __m128 src = (__v4sf) __builtin_shufflevector((__v4sf) __W, - (__v4sf)_mm_setzero_ps(), - 0, 4, 4, 4); - - return (__m128) __builtin_ia32_loadss128_mask ((const __v4sf *) __A, src, __U & 1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_load_ss (__mmask8 __U, const float* __A) -{ - return (__m128)__builtin_ia32_loadss128_mask ((const __v4sf *) __A, - (__v4sf) _mm_setzero_ps(), - __U & 1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_load_sd (__m128d __W, __mmask8 __U, const double* __A) -{ - __m128d src = (__v2df) __builtin_shufflevector((__v2df) __W, - (__v2df)_mm_setzero_pd(), - 0, 2); - - return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, src, __U & 1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_load_sd (__mmask8 __U, const double* __A) -{ - return (__m128d) __builtin_ia32_loadsd128_mask ((const __v2df *) __A, - (__v2df) _mm_setzero_pd(), - __U & 1); -} - -#define _mm512_shuffle_epi32(A, I) \ - ((__m512i)__builtin_ia32_pshufd512((__v16si)(__m512i)(A), (int)(I))) - -#define _mm512_mask_shuffle_epi32(W, U, A, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)(__m512i)(W))) - -#define _mm512_maskz_shuffle_epi32(U, A, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shuffle_epi32((A), (I)), \ - (__v16si)_mm512_setzero_si512())) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_expand_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_expand_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_expanddf512_mask ((__v8df) __A, - (__v8df) _mm512_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_expand_epi64 (__m512i __W, __mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, - (__v8di) __W, - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_expand_epi64 ( __mmask8 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_expanddi512_mask ((__v8di) __A, - (__v8di) _mm512_setzero_si512 (), - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_expandloadu_pd(__m512d __W, __mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, - (__v8df) __W, - (__mmask8) __U); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_expandloadu_pd(__mmask8 __U, void const *__P) -{ - return (__m512d) __builtin_ia32_expandloaddf512_mask ((const __v8df *)__P, - (__v8df) _mm512_setzero_pd(), - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_expandloadu_epi64(__m512i __W, __mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, - (__v8di) __W, - (__mmask8) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_expandloadu_epi64(__mmask8 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloaddi512_mask ((const __v8di *)__P, - (__v8di) _mm512_setzero_si512(), - (__mmask8) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 
-_mm512_mask_expandloadu_ps(__m512 __W, __mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_expandloadu_ps(__mmask16 __U, void const *__P) -{ - return (__m512) __builtin_ia32_expandloadsf512_mask ((const __v16sf *)__P, - (__v16sf) _mm512_setzero_ps(), - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_expandloadu_epi32(__m512i __W, __mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, - (__v16si) __W, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_expandloadu_epi32(__mmask16 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadsi512_mask ((const __v16si *)__P, - (__v16si) _mm512_setzero_si512(), - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_expand_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, - (__v16sf) __W, - (__mmask16) __U); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_expand_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_expandsf512_mask ((__v16sf) __A, - (__v16sf) _mm512_setzero_ps(), - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_expand_epi32 (__m512i __W, __mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, - (__v16si) __W, - (__mmask16) __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_expand_epi32 (__mmask16 __U, __m512i __A) -{ - return (__m512i) __builtin_ia32_expandsi512_mask ((__v16si) __A, - (__v16si) _mm512_setzero_si512(), - (__mmask16) __U); -} - -#define _mm512_cvt_roundps_pd(A, R) \ - ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm512_mask_cvt_roundps_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)(__m512d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundps_pd(U, A, R) \ - ((__m512d)__builtin_ia32_cvtps2pd512_mask((__v8sf)(__m256)(A), \ - (__v8df)_mm512_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtps_pd (__m256 __A) -{ - return (__m512d) __builtin_convertvector((__v8sf)__A, __v8df); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtps_pd (__m512d __W, __mmask8 __U, __m256 __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtps_pd(__A), - (__v8df)__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtps_pd (__mmask8 __U, __m256 __A) -{ - return (__m512d)__builtin_ia32_selectpd_512((__mmask8)__U, - (__v8df)_mm512_cvtps_pd(__A), - (__v8df)_mm512_setzero_pd()); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_cvtpslo_pd (__m512 __A) -{ - return (__m512d) _mm512_cvtps_pd(_mm512_castps512_ps256(__A)); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpslo_pd (__m512d __W, __mmask8 __U, __m512 __A) -{ - return (__m512d) _mm512_mask_cvtps_pd(__W, __U, _mm512_castps512_ps256(__A)); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_pd (__m512d __W, __mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) 
__W); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_pd (__mmask8 __U, __m512d __A) -{ - return (__m512d) __builtin_ia32_selectpd_512 ((__mmask8) __U, - (__v8df) __A, - (__v8df) _mm512_setzero_pd ()); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_mov_ps (__m512 __W, __mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) __W); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_mov_ps (__mmask16 __U, __m512 __A) -{ - return (__m512) __builtin_ia32_selectps_512 ((__mmask16) __U, - (__v16sf) __A, - (__v16sf) _mm512_setzero_ps ()); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m512d __A) -{ - __builtin_ia32_compressstoredf512_mask ((__v8df *) __P, (__v8df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m512i __A) -{ - __builtin_ia32_compressstoredi512_mask ((__v8di *) __P, (__v8di) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_compressstoreu_ps (void *__P, __mmask16 __U, __m512 __A) -{ - __builtin_ia32_compressstoresf512_mask ((__v16sf *) __P, (__v16sf) __A, - (__mmask16) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 -_mm512_mask_compressstoreu_epi32 (void *__P, __mmask16 __U, __m512i __A) -{ - __builtin_ia32_compressstoresi512_mask ((__v16si *) __P, (__v16si) __A, - (__mmask16) __U); -} - -#define _mm_cvt_roundsd_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_cvt_roundsd_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundsd_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_cvtsd2ss_round_mask((__v4sf)(__m128)(A), \ - (__v2df)(__m128d)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsd_ss (__m128 __W, __mmask8 __U, __m128 __A, __m128d __B) -{ - return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, - (__v2df)__B, - (__v4sf)__W, - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsd_ss (__mmask8 __U, __m128 __A, __m128d __B) -{ - return __builtin_ia32_cvtsd2ss_round_mask ((__v4sf)__A, - (__v2df)__B, - (__v4sf)_mm_setzero_ps(), - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvtss_i32 _mm_cvtss_si32 -#define _mm_cvtsd_i32 _mm_cvtsd_si32 -#define _mm_cvti32_sd _mm_cvtsi32_sd -#define _mm_cvti32_ss _mm_cvtsi32_ss -#ifdef __x86_64__ -#define _mm_cvtss_i64 _mm_cvtss_si64 -#define _mm_cvtsd_i64 _mm_cvtsd_si64 -#define _mm_cvti64_sd _mm_cvtsi64_sd -#define _mm_cvti64_ss _mm_cvtsi64_ss -#endif - -#ifdef __x86_64__ -#define _mm_cvt_roundi64_sd(A, B, R) \ - ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R))) - -#define _mm_cvt_roundsi64_sd(A, B, R) \ - ((__m128d)__builtin_ia32_cvtsi2sd64((__v2df)(__m128d)(A), (long long)(B), \ - (int)(R))) -#endif - -#define _mm_cvt_roundsi32_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) - -#define _mm_cvt_roundi32_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtsi2ss32((__v4sf)(__m128)(A), (int)(B), (int)(R))) - -#ifdef __x86_64__ 
-#define _mm_cvt_roundsi64_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R))) - -#define _mm_cvt_roundi64_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtsi2ss64((__v4sf)(__m128)(A), (long long)(B), \ - (int)(R))) -#endif - -#define _mm_cvt_roundss_sd(A, B, R) \ - ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_cvt_roundss_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundss_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_cvtss2sd_round_mask((__v2df)(__m128d)(A), \ - (__v4sf)(__m128)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtss_sd (__m128d __W, __mmask8 __U, __m128d __A, __m128 __B) -{ - return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, - (__v4sf)__B, - (__v2df)__W, - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtss_sd (__mmask8 __U, __m128d __A, __m128 __B) -{ - return __builtin_ia32_cvtss2sd_round_mask((__v2df)__A, - (__v4sf)__B, - (__v2df)_mm_setzero_pd(), - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_cvtu32_sd (__m128d __A, unsigned __B) -{ - __A[0] = __B; - return __A; -} - -#ifdef __x86_64__ -#define _mm_cvt_roundu64_sd(A, B, R) \ - ((__m128d)__builtin_ia32_cvtusi2sd64((__v2df)(__m128d)(A), \ - (unsigned long long)(B), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_cvtu64_sd (__m128d __A, unsigned long long __B) -{ - __A[0] = __B; - return __A; -} -#endif - -#define _mm_cvt_roundu32_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtusi2ss32((__v4sf)(__m128)(A), (unsigned int)(B), \ - (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtu32_ss (__m128 __A, unsigned __B) -{ - __A[0] = __B; - return __A; -} - -#ifdef __x86_64__ -#define _mm_cvt_roundu64_ss(A, B, R) \ - ((__m128)__builtin_ia32_cvtusi2ss64((__v4sf)(__m128)(A), \ - (unsigned long long)(B), (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtu64_ss (__m128 __A, unsigned long long __B) -{ - __A[0] = __B; - return __A; -} -#endif - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi32 (__m512i __O, __mmask16 __M, int __A) -{ - return (__m512i) __builtin_ia32_selectd_512(__M, - (__v16si) _mm512_set1_epi32(__A), - (__v16si) __O); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_set1_epi64 (__m512i __O, __mmask8 __M, long long __A) -{ - return (__m512i) __builtin_ia32_selectq_512(__M, - (__v8di) _mm512_set1_epi64(__A), - (__v8di) __O); -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi8 (char __e63, char __e62, char __e61, char __e60, char __e59, - char __e58, char __e57, char __e56, char __e55, char __e54, char __e53, - char __e52, char __e51, char __e50, char __e49, char __e48, char __e47, - char __e46, char __e45, char __e44, char __e43, char __e42, char __e41, - char __e40, char __e39, char __e38, char __e37, char __e36, char __e35, - char __e34, char __e33, char __e32, char __e31, char __e30, char __e29, - char __e28, char __e27, char __e26, char __e25, char __e24, char __e23, - char __e22, char __e21, char __e20, char __e19, char __e18, char __e17, - char __e16, char __e15, char __e14, char __e13, char 
__e12, char __e11, - char __e10, char __e9, char __e8, char __e7, char __e6, char __e5, - char __e4, char __e3, char __e2, char __e1, char __e0) { - - return __extension__ (__m512i)(__v64qi) - {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, - __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, - __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, - __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31, - __e32, __e33, __e34, __e35, __e36, __e37, __e38, __e39, - __e40, __e41, __e42, __e43, __e44, __e45, __e46, __e47, - __e48, __e49, __e50, __e51, __e52, __e53, __e54, __e55, - __e56, __e57, __e58, __e59, __e60, __e61, __e62, __e63}; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi16(short __e31, short __e30, short __e29, short __e28, - short __e27, short __e26, short __e25, short __e24, short __e23, - short __e22, short __e21, short __e20, short __e19, short __e18, - short __e17, short __e16, short __e15, short __e14, short __e13, - short __e12, short __e11, short __e10, short __e9, short __e8, - short __e7, short __e6, short __e5, short __e4, short __e3, - short __e2, short __e1, short __e0) { - return __extension__ (__m512i)(__v32hi) - {__e0, __e1, __e2, __e3, __e4, __e5, __e6, __e7, - __e8, __e9, __e10, __e11, __e12, __e13, __e14, __e15, - __e16, __e17, __e18, __e19, __e20, __e21, __e22, __e23, - __e24, __e25, __e26, __e27, __e28, __e29, __e30, __e31 }; -} - -static __inline __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi32 (int __A, int __B, int __C, int __D, - int __E, int __F, int __G, int __H, - int __I, int __J, int __K, int __L, - int __M, int __N, int __O, int __P) -{ - return __extension__ (__m512i)(__v16si) - { __P, __O, __N, __M, __L, __K, __J, __I, - __H, __G, __F, __E, __D, __C, __B, __A }; -} - -#define _mm512_setr_epi32(e0,e1,e2,e3,e4,e5,e6,e7, \ - e8,e9,e10,e11,e12,e13,e14,e15) \ - _mm512_set_epi32((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6), \ - (e5),(e4),(e3),(e2),(e1),(e0)) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_set_epi64 (long long __A, long long __B, long long __C, - long long __D, long long __E, long long __F, - long long __G, long long __H) -{ - return __extension__ (__m512i) (__v8di) - { __H, __G, __F, __E, __D, __C, __B, __A }; -} - -#define _mm512_setr_epi64(e0,e1,e2,e3,e4,e5,e6,e7) \ - _mm512_set_epi64((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_set_pd (double __A, double __B, double __C, double __D, - double __E, double __F, double __G, double __H) -{ - return __extension__ (__m512d) - { __H, __G, __F, __E, __D, __C, __B, __A }; -} - -#define _mm512_setr_pd(e0,e1,e2,e3,e4,e5,e6,e7) \ - _mm512_set_pd((e7),(e6),(e5),(e4),(e3),(e2),(e1),(e0)) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_set_ps (float __A, float __B, float __C, float __D, - float __E, float __F, float __G, float __H, - float __I, float __J, float __K, float __L, - float __M, float __N, float __O, float __P) -{ - return __extension__ (__m512) - { __P, __O, __N, __M, __L, __K, __J, __I, - __H, __G, __F, __E, __D, __C, __B, __A }; -} - -#define _mm512_setr_ps(e0,e1,e2,e3,e4,e5,e6,e7,e8,e9,e10,e11,e12,e13,e14,e15) \ - _mm512_set_ps((e15),(e14),(e13),(e12),(e11),(e10),(e9),(e8),(e7),(e6),(e5), \ - (e4),(e3),(e2),(e1),(e0)) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_abs_ps(__m512 __A) -{ - return (__m512)_mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_ps(__m512 __W, __mmask16 __K, 
__m512 __A) -{ - return (__m512)_mm512_mask_and_epi32((__m512i)__W, __K, _mm512_set1_epi32(0x7FFFFFFF),(__m512i)__A) ; -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_abs_pd(__m512d __A) -{ - return (__m512d)_mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A) ; -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_abs_pd(__m512d __W, __mmask8 __K, __m512d __A) -{ - return (__m512d)_mm512_mask_and_epi64((__v8di)__W, __K, _mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),(__v8di)__A); -} - -/* Vector-reduction arithmetic accepts vectors as inputs and produces scalars as - * outputs. This class of vector operation forms the basis of many scientific - * computations. In vector-reduction arithmetic, the evaluation order is - * independent of the order of the input elements of V. - - * For floating-point intrinsics: - * 1. When using fadd/fmul intrinsics, the order of operations within the - * vector is unspecified (associative math). - * 2. When using fmin/fmax intrinsics, NaN or -0.0 elements within the vector - * produce unspecified results. - - * Used bisection method. At each step, we partition the vector with previous - * step in half, and the operation is performed on its two halves. - * This takes log2(n) steps where n is the number of elements in the vector. - */ - -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_add_epi64(__m512i __W) { -#if (__clang_major__ > 14) - return __builtin_reduce_add((__v8di)__W); -#else - return __builtin_ia32_reduce_add_q512(__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_epi64(__m512i __W) { -#if (__clang_major__ > 14) - return __builtin_reduce_mul((__v8di)__W); -#else - return __builtin_ia32_reduce_mul_q512(__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_and_epi64(__m512i __W) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_and_q512(__W); -#else - return __builtin_reduce_and((__v8di)__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 _mm512_reduce_or_epi64(__m512i __W) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_or_q512(__W); -#else - return __builtin_reduce_or((__v8di)__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_add_epi64(__mmask8 __M, __m512i __W) { - __W = _mm512_maskz_mov_epi64(__M, __W); -#if (__clang_major__ > 14) - return __builtin_reduce_add((__v8di)__W); -#else - return __builtin_ia32_reduce_add_q512(__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_mul_epi64(__mmask8 __M, __m512i __W) { - __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(1), __M, __W); -#if (__clang_major__ > 14) - return __builtin_reduce_mul((__v8di)__W); -#else - return __builtin_ia32_reduce_mul_q512(__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_and_epi64(__mmask8 __M, __m512i __W) { - __W = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __W); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_and_q512(__W); -#else - return __builtin_reduce_and((__v8di)__W); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_or_epi64(__mmask8 __M, __m512i __W) { - __W = _mm512_maskz_mov_epi64(__M, __W); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_or_q512(__W); -#else - return __builtin_reduce_or((__v8di)__W); -#endif -} - -// -0.0 is used to ignore the start value since it is the neutral value of -// floating point addition. 
For more information, please refer to -// https://llvm.org/docs/LangRef.html#llvm-vector-reduce-fadd-intrinsic -static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_add_pd(__m512d __W) { - return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 _mm512_reduce_mul_pd(__m512d __W) { - return __builtin_ia32_reduce_fmul_pd512(1.0, __W); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_add_pd(__mmask8 __M, __m512d __W) { - __W = _mm512_maskz_mov_pd(__M, __W); - return __builtin_ia32_reduce_fadd_pd512(-0.0, __W); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_mul_pd(__mmask8 __M, __m512d __W) { - __W = _mm512_mask_mov_pd(_mm512_set1_pd(1.0), __M, __W); - return __builtin_ia32_reduce_fmul_pd512(1.0, __W); -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_add_epi32(__m512i __W) { -#if (__clang_major__ > 14) - return __builtin_reduce_add((__v16si)__W); -#else - return __builtin_ia32_reduce_add_d512((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_mul_epi32(__m512i __W) { -#if (__clang_major__ > 14) - return __builtin_reduce_mul((__v16si)__W); -#else - return __builtin_ia32_reduce_mul_d512((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_and_epi32(__m512i __W) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_and_d512((__v16si)__W); -#else - return __builtin_reduce_and((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_or_epi32(__m512i __W) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_or_d512((__v16si)__W); -#else - return __builtin_reduce_or((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_add_epi32( __mmask16 __M, __m512i __W) { - __W = _mm512_maskz_mov_epi32(__M, __W); -#if (__clang_major__ > 14) - return __builtin_reduce_add((__v16si)__W); -#else - return __builtin_ia32_reduce_add_d512((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_mul_epi32( __mmask16 __M, __m512i __W) { - __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(1), __M, __W); -#if (__clang_major__ > 14) - return __builtin_reduce_mul((__v16si)__W); -#else - return __builtin_ia32_reduce_mul_d512((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_and_epi32( __mmask16 __M, __m512i __W) { - __W = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __W); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_and_d512((__v16si)__W); -#else - return __builtin_reduce_and((__v16si)__W); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_or_epi32(__mmask16 __M, __m512i __W) { - __W = _mm512_maskz_mov_epi32(__M, __W); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_or_d512((__v16si)__W); -#else - return __builtin_reduce_or((__v16si)__W); -#endif -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_reduce_add_ps(__m512 __W) { - return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_reduce_mul_ps(__m512 __W) { - return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_add_ps(__mmask16 __M, __m512 __W) { - __W = _mm512_maskz_mov_ps(__M, __W); - return __builtin_ia32_reduce_fadd_ps512(-0.0f, __W); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_mul_ps(__mmask16 __M, __m512 
__W) { - __W = _mm512_mask_mov_ps(_mm512_set1_ps(1.0f), __M, __W); - return __builtin_ia32_reduce_fmul_ps512(1.0f, __W); -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_epi64(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smax_q512(__V); -#else - return __builtin_reduce_max((__v8di)__V); -#endif -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_epu64(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umax_q512(__V); -#else - return __builtin_reduce_max((__v8du)__V); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_epi64(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smin_q512(__V); -#else - return __builtin_reduce_min((__v8di)__V); -#endif -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_epu64(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umin_q512(__V); -#else - return __builtin_reduce_min((__v8du)__V); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_epi64(__mmask8 __M, __m512i __V) { - __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(-__LONG_LONG_MAX__ - 1LL), __M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smax_q512(__V); -#else - return __builtin_reduce_max((__v8di)__V); -#endif -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_epu64(__mmask8 __M, __m512i __V) { - __V = _mm512_maskz_mov_epi64(__M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umax_q512(__V); -#else - return __builtin_reduce_max((__v8du)__V); -#endif -} - -static __inline__ long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_epi64(__mmask8 __M, __m512i __V) { - __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(__LONG_LONG_MAX__), __M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smin_q512(__V); -#else - return __builtin_reduce_min((__v8di)__V); -#endif -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_epu64(__mmask8 __M, __m512i __V) { - __V = _mm512_mask_mov_epi64(_mm512_set1_epi64(~0ULL), __M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umin_q512(__V); -#else - return __builtin_reduce_min((__v8du)__V); -#endif -} -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_epi32(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smax_d512((__v16si)__V); -#else - return __builtin_reduce_max((__v16si)__V); -#endif -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_epu32(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umax_d512((__v16si)__V); -#else - return __builtin_reduce_max((__v16su)__V); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_epi32(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smin_d512((__v16si)__V); -#else - return __builtin_reduce_min((__v16si)__V); -#endif -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_epu32(__m512i __V) { -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umin_d512((__v16si)__V); -#else - return __builtin_reduce_min((__v16su)__V); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_epi32(__mmask16 __M, __m512i __V) { - __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(-__INT_MAX__ - 1), __M, __V); -#if (__clang_major__ < 14) - return 
__builtin_ia32_reduce_smax_d512((__v16si)__V); -#else - return __builtin_reduce_max((__v16si)__V); -#endif -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_epu32(__mmask16 __M, __m512i __V) { - __V = _mm512_maskz_mov_epi32(__M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umax_d512((__v16si)__V); -#else - return __builtin_reduce_max((__v16su)__V); -#endif -} - -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_epi32(__mmask16 __M, __m512i __V) { - __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(__INT_MAX__), __M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_smin_d512((__v16si)__V); -#else - return __builtin_reduce_min((__v16si)__V); -#endif -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_epu32(__mmask16 __M, __m512i __V) { - __V = _mm512_mask_mov_epi32(_mm512_set1_epi32(~0U), __M, __V); -#if (__clang_major__ < 14) - return __builtin_ia32_reduce_umin_d512((__v16si)__V); -#else - return __builtin_reduce_min((__v16su)__V); -#endif -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_pd(__m512d __V) { - return __builtin_ia32_reduce_fmax_pd512(__V); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_pd(__m512d __V) { - return __builtin_ia32_reduce_fmin_pd512(__V); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_pd(__mmask8 __M, __m512d __V) { - __V = _mm512_mask_mov_pd(_mm512_set1_pd(-__builtin_inf()), __M, __V); - return __builtin_ia32_reduce_fmax_pd512(__V); -} - -static __inline__ double __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_pd(__mmask8 __M, __m512d __V) { - __V = _mm512_mask_mov_pd(_mm512_set1_pd(__builtin_inf()), __M, __V); - return __builtin_ia32_reduce_fmin_pd512(__V); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_ps(__m512 __V) { - return __builtin_ia32_reduce_fmax_ps512(__V); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_ps(__m512 __V) { - return __builtin_ia32_reduce_fmin_ps512(__V); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_max_ps(__mmask16 __M, __m512 __V) { - __V = _mm512_mask_mov_ps(_mm512_set1_ps(-__builtin_inff()), __M, __V); - return __builtin_ia32_reduce_fmax_ps512(__V); -} - -static __inline__ float __DEFAULT_FN_ATTRS512 -_mm512_mask_reduce_min_ps(__mmask16 __M, __m512 __V) { - __V = _mm512_mask_mov_ps(_mm512_set1_ps(__builtin_inff()), __M, __V); - return __builtin_ia32_reduce_fmin_ps512(__V); -} - -/// Moves the least significant 32 bits of a vector of [16 x i32] to a -/// 32-bit signed integer value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVD / MOVD instruction. -/// -/// \param __A -/// A vector of [16 x i32]. The least significant 32 bits are moved to the -/// destination. -/// \returns A 32-bit signed integer containing the moved value. -static __inline__ int __DEFAULT_FN_ATTRS512 -_mm512_cvtsi512_si32(__m512i __A) { - __v16si __b = (__v16si)__A; - return __b[0]; -} - -/// Loads 8 double-precision (64-bit) floating-point elements stored at memory -/// locations starting at location \a base_addr at packed 32-bit integer indices -/// stored in the lower half of \a vindex scaled by \a scale them in dst. -/// -/// This intrinsic corresponds to the VGATHERDPD instructions. 
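/* Editor's note: illustrative usage sketch added during editing, not part of
 * the upstream header. It shows how the vector-reduction helpers defined
 * earlier in this file collapse a 512-bit vector into a scalar. The function
 * names sum16 and max_active are hypothetical; AVX-512F support, a 64-byte
 * aligned input, and inclusion via <immintrin.h> are assumed. */
static __inline__ int sum16(const int *p)
{
  __m512i v = _mm512_load_epi32(p);   /* load 16 packed 32-bit integers */
  return _mm512_reduce_add_epi32(v);  /* bisection reduction, log2(16) steps */
}

static __inline__ float max_active(__m512 v, __mmask16 k)
{
  /* Lanes whose mask bit is clear are replaced by -inf before reducing,
   * exactly as _mm512_mask_reduce_max_ps does above. */
  return _mm512_mask_reduce_max_ps(k, v);
}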
-/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// dst[i+63:i] := MEM[addr+63:addr] -/// ENDFOR -/// dst[MAX:512] := 0 -/// \endoperation -#define _mm512_i32logather_pd(vindex, base_addr, scale) \ - _mm512_i32gather_pd(_mm512_castsi512_si256(vindex), (base_addr), (scale)) - -/// Loads 8 double-precision (64-bit) floating-point elements from memory -/// starting at location \a base_addr at packed 32-bit integer indices stored in -/// the lower half of \a vindex scaled by \a scale into dst using writemask -/// \a mask (elements are copied from \a src when the corresponding mask bit is -/// not set). -/// -/// This intrinsic corresponds to the VGATHERDPD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// IF mask[j] -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// dst[i+63:i] := MEM[addr+63:addr] -/// ELSE -/// dst[i+63:i] := src[i+63:i] -/// FI -/// ENDFOR -/// dst[MAX:512] := 0 -/// \endoperation -#define _mm512_mask_i32logather_pd(src, mask, vindex, base_addr, scale) \ - _mm512_mask_i32gather_pd((src), (mask), _mm512_castsi512_si256(vindex), \ - (base_addr), (scale)) - -/// Loads 8 64-bit integer elements from memory starting at location \a base_addr -/// at packed 32-bit integer indices stored in the lower half of \a vindex -/// scaled by \a scale and stores them in dst. -/// -/// This intrinsic corresponds to the VPGATHERDQ instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// dst[i+63:i] := MEM[addr+63:addr] -/// ENDFOR -/// dst[MAX:512] := 0 -/// \endoperation -#define _mm512_i32logather_epi64(vindex, base_addr, scale) \ - _mm512_i32gather_epi64(_mm512_castsi512_si256(vindex), (base_addr), (scale)) - -/// Loads 8 64-bit integer elements from memory starting at location \a base_addr -/// at packed 32-bit integer indices stored in the lower half of \a vindex -/// scaled by \a scale and stores them in dst using writemask \a mask (elements -/// are copied from \a src when the corresponding mask bit is not set). -/// -/// This intrinsic corresponds to the VPGATHERDQ instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// IF mask[j] -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// dst[i+63:i] := MEM[addr+63:addr] -/// ELSE -/// dst[i+63:i] := src[i+63:i] -/// FI -/// ENDFOR -/// dst[MAX:512] := 0 -/// \endoperation -#define _mm512_mask_i32logather_epi64(src, mask, vindex, base_addr, scale) \ - _mm512_mask_i32gather_epi64((src), (mask), _mm512_castsi512_si256(vindex), \ - (base_addr), (scale)) - -/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 -/// and to memory locations starting at location \a base_addr at packed 32-bit -/// integer indices stored in \a vindex scaled by \a scale. -/// -/// This intrinsic corresponds to the VSCATTERDPD instructions. 
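/* Editor's note: illustrative sketch added during editing, not part of the
 * upstream header. It exercises the _mm512_i32logather_pd wrapper defined
 * above, which gathers through the 32-bit indices held in the low half of a
 * 512-bit index vector. The helper name gather8_pd is hypothetical; the
 * scale argument of 8 matches sizeof(double). */
static __inline__ __m512d gather8_pd(const double *base, __m512i idx)
{
  /* dst[j] = base[idx32[j]] for j = 0..7; only idx[255:0] is consumed. */
  return _mm512_i32logather_pd(idx, base, 8);
}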
-/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// MEM[addr+63:addr] := v1[i+63:i] -/// ENDFOR -/// \endoperation -#define _mm512_i32loscatter_pd(base_addr, vindex, v1, scale) \ - _mm512_i32scatter_pd((base_addr), _mm512_castsi512_si256(vindex), (v1), (scale)) - -/// Stores 8 packed double-precision (64-bit) floating-point elements in \a v1 -/// to memory locations starting at location \a base_addr at packed 32-bit -/// integer indices stored in \a vindex scaled by \a scale. Only those elements -/// whose corresponding mask bit is set in writemask \a mask are written to -/// memory. -/// -/// This intrinsic corresponds to the VSCATTERDPD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// IF mask[j] -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// MEM[addr+63:addr] := a[i+63:i] -/// FI -/// ENDFOR -/// \endoperation -#define _mm512_mask_i32loscatter_pd(base_addr, mask, vindex, v1, scale) \ - _mm512_mask_i32scatter_pd((base_addr), (mask), \ - _mm512_castsi512_si256(vindex), (v1), (scale)) - -/// Stores 8 packed 64-bit integer elements located in \a v1 and stores them in -/// memory locations starting at location \a base_addr at packed 32-bit integer -/// indices stored in \a vindex scaled by \a scale. -/// -/// This intrinsic corresponds to the VPSCATTERDQ instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// MEM[addr+63:addr] := a[i+63:i] -/// ENDFOR -/// \endoperation -#define _mm512_i32loscatter_epi64(base_addr, vindex, v1, scale) \ - _mm512_i32scatter_epi64((base_addr), \ - _mm512_castsi512_si256(vindex), (v1), (scale)) - -/// Stores 8 packed 64-bit integer elements located in a and stores them in -/// memory locations starting at location \a base_addr at packed 32-bit integer -/// indices stored in \a vindex scaled by scale using writemask \a mask (elements -/// whose corresponding mask bit is not set are not written to memory). -/// -/// This intrinsic corresponds to the VPSCATTERDQ instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// i := j*64 -/// m := j*32 -/// IF mask[j] -/// addr := base_addr + SignExtend64(vindex[m+31:m]) * ZeroExtend64(scale) * 8 -/// MEM[addr+63:addr] := a[i+63:i] -/// FI -/// ENDFOR -/// \endoperation -#define _mm512_mask_i32loscatter_epi64(base_addr, mask, vindex, v1, scale) \ - _mm512_mask_i32scatter_epi64((base_addr), (mask), \ - _mm512_castsi512_si256(vindex), (v1), (scale)) - -#undef __DEFAULT_FN_ATTRS512 -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS - -#endif /* __AVX512FINTRIN_H */ diff --git a/include/avx512fp16intrin.h b/include/avx512fp16intrin.h deleted file mode 100644 index 99409a3..0000000 --- a/include/avx512fp16intrin.h +++ /dev/null @@ -1,3349 +0,0 @@ -/*===----------- avx512fp16intrin.h - AVX512-FP16 intrinsics ---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512FP16INTRIN_H -#define __AVX512FP16INTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -typedef _Float16 __v32hf __attribute__((__vector_size__(64), __aligned__(64))); -typedef _Float16 __m512h __attribute__((__vector_size__(64), __aligned__(64))); -typedef _Float16 __m512h_u __attribute__((__vector_size__(64), __aligned__(1))); -typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); -typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); -typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); -typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); -typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); -typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS512 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ - __min_vector_width__(512))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ - __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512fp16"), \ - __min_vector_width__(128))) - -static __inline__ _Float16 __DEFAULT_FN_ATTRS512 _mm512_cvtsh_h(__m512h __a) { - return __a[0]; -} - -static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_setzero_ph(void) { - return (__m128h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; -} - -static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_setzero_ph(void) { - return (__m256h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_undefined_ph(void) { - return (__m256h)__builtin_ia32_undef256(); -} - -static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_setzero_ph(void) { - return (__m512h){0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, - 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0}; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_undefined_ph(void) { - return (__m128h)__builtin_ia32_undef128(); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_undefined_ph(void) { - return (__m512h)__builtin_ia32_undef512(); -} - -static __inline __m512h __DEFAULT_FN_ATTRS512 _mm512_set1_ph(_Float16 __h) { - return (__m512h)(__v32hf){__h, __h, __h, __h, __h, __h, __h, __h, - __h, __h, __h, __h, __h, __h, __h, __h, - __h, __h, __h, __h, __h, __h, __h, __h, - __h, __h, __h, __h, __h, __h, __h, __h}; -} - -static __inline __m512h __DEFAULT_FN_ATTRS512 -_mm512_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, - _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, - _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, - _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16, - _Float16 __h17, _Float16 __h18, _Float16 __h19, _Float16 __h20, - _Float16 __h21, _Float16 __h22, _Float16 __h23, _Float16 __h24, - _Float16 __h25, _Float16 __h26, _Float16 __h27, _Float16 __h28, - _Float16 __h29, _Float16 __h30, _Float16 __h31, _Float16 __h32) { - return (__m512h)(__v32hf){__h32, __h31, __h30, __h29, __h28, __h27, __h26, - __h25, __h24, __h23, __h22, __h21, __h20, __h19, - __h18, __h17, __h16, __h15, __h14, __h13, __h12, - __h11, __h10, __h9, __h8, __h7, __h6, __h5, - __h4, __h3, __h2, __h1}; -} - -#define _mm512_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ - h14, h15, h16, h17, h18, h19, h20, h21, h22, h23, h24, \ - h25, h26, 
h27, h28, h29, h30, h31, h32) \ - _mm512_set_ph((h32), (h31), (h30), (h29), (h28), (h27), (h26), (h25), (h24), \ - (h23), (h22), (h21), (h20), (h19), (h18), (h17), (h16), (h15), \ - (h14), (h13), (h12), (h11), (h10), (h9), (h8), (h7), (h6), \ - (h5), (h4), (h3), (h2), (h1)) - -static __inline __m512h __DEFAULT_FN_ATTRS512 -_mm512_set1_pch(_Float16 _Complex h) { - return (__m512h)_mm512_set1_ps(__builtin_bit_cast(float, h)); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_castph_ps(__m128h __a) { - return (__m128)__a; -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_castph_ps(__m256h __a) { - return (__m256)__a; -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_castph_ps(__m512h __a) { - return (__m512)__a; -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_castph_pd(__m128h __a) { - return (__m128d)__a; -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_castph_pd(__m256h __a) { - return (__m256d)__a; -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_castph_pd(__m512h __a) { - return (__m512d)__a; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_castph_si128(__m128h __a) { - return (__m128i)__a; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_castph_si256(__m256h __a) { - return (__m256i)__a; -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_castph_si512(__m512h __a) { - return (__m512i)__a; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castps_ph(__m128 __a) { - return (__m128h)__a; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castps_ph(__m256 __a) { - return (__m256h)__a; -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castps_ph(__m512 __a) { - return (__m512h)__a; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castpd_ph(__m128d __a) { - return (__m128h)__a; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_castpd_ph(__m256d __a) { - return (__m256h)__a; -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_castpd_ph(__m512d __a) { - return (__m512h)__a; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_castsi128_ph(__m128i __a) { - return (__m128h)__a; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_castsi256_ph(__m256i __a) { - return (__m256h)__a; -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_castsi512_ph(__m512i __a) { - return (__m512h)__a; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_castph256_ph128(__m256h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_castph512_ph128(__m512h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_castph512_ph256(__m512h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_castph128_ph256(__m128h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, - -1, -1, -1, -1, -1); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_castph128_ph512(__m128h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_castph256_ph512(__m256h __a) { - return __builtin_shufflevector(__a, __a, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15, -1, 
-1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1); -} - -/// Constructs a 256-bit floating-point vector of [16 x half] from a -/// 128-bit floating-point vector of [8 x half]. The lower 128 bits -/// contain the value of the source vector. The upper 384 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x half]. -/// \returns A 512-bit floating-point vector of [16 x half]. The lower 128 bits -/// contain the value of the parameter. The upper 384 bits are set to zero. -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_zextph128_ph256(__m128h __a) { - return __builtin_shufflevector(__a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, - 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); -} - -/// Constructs a 512-bit floating-point vector of [32 x half] from a -/// 128-bit floating-point vector of [8 x half]. The lower 128 bits -/// contain the value of the source vector. The upper 384 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x half]. -/// \returns A 512-bit floating-point vector of [32 x half]. The lower 128 bits -/// contain the value of the parameter. The upper 384 bits are set to zero. -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_zextph128_ph512(__m128h __a) { - return __builtin_shufflevector( - __a, (__v8hf)_mm_setzero_ph(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, - 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15, 8, 9, 10, 11, 12, 13, 14, 15); -} - -/// Constructs a 512-bit floating-point vector of [32 x half] from a -/// 256-bit floating-point vector of [16 x half]. The lower 256 bits -/// contain the value of the source vector. The upper 256 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit vector of [16 x half]. -/// \returns A 512-bit floating-point vector of [32 x half]. The lower 256 bits -/// contain the value of the parameter. The upper 256 bits are set to zero. 
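/* Editor's note: illustrative sketch added during editing, not part of the
 * upstream header. It shows the zero-extending cast _mm512_zextph128_ph512
 * defined above; the helper name widen_ph is hypothetical and AVX-512FP16
 * support is assumed. */
static __inline__ __m512h widen_ph(__m128h lo)
{
  /* Lanes 0..7 keep the source half-precision values, lanes 8..31 become 0. */
  return _mm512_zextph128_ph512(lo);
}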
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_zextph256_ph512(__m256h __a) { - return __builtin_shufflevector(__a, (__v16hf)_mm256_setzero_ph(), 0, 1, 2, 3, - 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, - 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, - 29, 30, 31); -} - -#define _mm_comi_round_sh(A, B, P, R) \ - __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, (int)(P), (int)(R)) - -#define _mm_comi_sh(A, B, pred) \ - _mm_comi_round_sh((A), (B), (pred), _MM_FROUND_CUR_DIRECTION) - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comieq_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OS, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comilt_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OS, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comile_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OS, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comigt_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OS, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comige_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OS, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_comineq_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_US, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomieq_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_EQ_OQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomilt_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LT_OQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomile_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_LE_OQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomigt_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GT_OQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomige_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_GE_OQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_ucomineq_sh(__m128h A, - __m128h B) { - return __builtin_ia32_vcomish((__v8hf)A, (__v8hf)B, _CMP_NEQ_UQ, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_add_ph(__m512h __A, - __m512h __B) { - return (__m512h)((__v32hf)__A + (__v32hf)__B); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_add_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_add_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_add_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_add_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_add_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_addph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_add_round_ph(W, U, A, B, R) 
\ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_add_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_add_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sub_ph(__m512h __A, - __m512h __B) { - return (__m512h)((__v32hf)__A - (__v32hf)__B); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_sub_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_sub_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_sub_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_sub_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_sub_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_subph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_sub_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_sub_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sub_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_mul_ph(__m512h __A, - __m512h __B) { - return (__m512h)((__v32hf)__A * (__v32hf)__B); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_mul_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_mul_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_mul_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_mul_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_mul_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_mulph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_mul_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_mul_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_mul_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_div_ph(__m512h __A, - __m512h __B) { - return (__m512h)((__v32hf)__A / (__v32hf)__B); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_div_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_div_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_div_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_div_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_div_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_divph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_div_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), 
(__v32hf)_mm512_div_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_div_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_div_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_min_ph(__m512h __A, - __m512h __B) { - return (__m512h)__builtin_ia32_minph512((__v32hf)__A, (__v32hf)__B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_min_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_min_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_min_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_min_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_min_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_minph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_min_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_min_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_min_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_max_ph(__m512h __A, - __m512h __B) { - return (__m512h)__builtin_ia32_maxph512((__v32hf)__A, (__v32hf)__B, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_max_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)__U, (__v32hf)_mm512_max_ph(__A, __B), (__v32hf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_max_ph(__mmask32 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, - (__v32hf)_mm512_max_ph(__A, __B), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm512_max_round_ph(A, B, R) \ - ((__m512h)__builtin_ia32_maxph512((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(R))) - -#define _mm512_mask_max_round_ph(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_max_round_ph(U, A, B, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_max_round_ph((A), (B), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_abs_ph(__m512h __A) { - return (__m512h)_mm512_and_epi32(_mm512_set1_epi32(0x7FFF7FFF), (__m512i)__A); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_conj_pch(__m512h __A) { - return (__m512h)_mm512_xor_ps((__m512)__A, _mm512_set1_ps(-0.0f)); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_conj_pch(__m512h __W, __mmask16 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectps_512( - (__mmask16)__U, (__v16sf)_mm512_conj_pch(__A), (__v16sf)__W); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_conj_pch(__mmask16 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectps_512((__mmask16)__U, - (__v16sf)_mm512_conj_pch(__A), - (__v16sf)_mm512_setzero_ps()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_sh(__m128h __A, - __m128h __B) 
{ - __A[0] += __B[0]; - return __A; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_add_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, __W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_add_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); -} - -#define _mm_add_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_addsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_add_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_addsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_add_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_addsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_sh(__m128h __A, - __m128h __B) { - __A[0] -= __B[0]; - return __A; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_sub_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, __W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_sub_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); -} - -#define _mm_sub_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_subsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_sub_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_subsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_sub_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_subsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_sh(__m128h __A, - __m128h __B) { - __A[0] *= __B[0]; - return __A; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_mul_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, __W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_mul_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); -} - -#define _mm_mul_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_mulsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_mul_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_mulsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_mul_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_mulsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_sh(__m128h __A, - __m128h __B) { - __A[0] /= __B[0]; - return __A; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_sh(__m128h __W, - __mmask8 __U, - __m128h __A, 
- __m128h __B) { - __A = _mm_div_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, __W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - __A = _mm_div_sh(__A, __B); - return __builtin_ia32_selectsh_128(__U, __A, _mm_setzero_ph()); -} - -#define _mm_div_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_divsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_div_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_divsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_div_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_divsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_sh(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_minsh_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_minsh_round_mask((__v8hf)__A, (__v8hf)__B, - (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_minsh_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_min_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_minsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_min_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_minsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_min_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_minsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_sh(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_maxsh_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_maxsh_round_mask((__v8hf)__A, (__v8hf)__B, - (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_maxsh_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_max_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_maxsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_max_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_maxsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_max_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_maxsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_cmp_round_ph_mask(A, B, P, R) \ - ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(P), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask_cmp_round_ph_mask(U, A, B, P, R) \ - ((__mmask32)__builtin_ia32_cmpph512_mask((__v32hf)(__m512h)(A), \ - (__v32hf)(__m512h)(B), (int)(P), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_cmp_ph_mask(A, B, P) \ - _mm512_cmp_round_ph_mask((A), (B), (P), _MM_FROUND_CUR_DIRECTION) - -#define _mm512_mask_cmp_ph_mask(U, A, B, P) \ - _mm512_mask_cmp_round_ph_mask((U), (A), (B), (P), _MM_FROUND_CUR_DIRECTION) - -#define _mm_cmp_round_sh_mask(X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ - (__v8hf)(__m128h)(Y), (int)(P), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_cmp_round_sh_mask(M, X, Y, P, R) \ - ((__mmask8)__builtin_ia32_cmpsh_mask((__v8hf)(__m128h)(X), \ - (__v8hf)(__m128h)(Y), (int)(P), \ - (__mmask8)(M), (int)(R))) - -#define _mm_cmp_sh_mask(X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsh_mask( \ - (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_cmp_sh_mask(M, X, Y, P) \ - ((__mmask8)__builtin_ia32_cmpsh_mask( \ - (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), (int)(P), (__mmask8)(M), \ - _MM_FROUND_CUR_DIRECTION)) -// loads with vmovsh: -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_sh(void const *__dp) { - struct __mm_load_sh_struct { - _Float16 __u; - } __attribute__((__packed__, __may_alias__)); - _Float16 __u = ((struct __mm_load_sh_struct *)__dp)->__u; - return (__m128h){__u, 0, 0, 0, 0, 0, 0, 0}; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_load_sh(__m128h __W, __mmask8 __U, const void *__A) { - __m128h src = (__v8hf)__builtin_shufflevector( - (__v8hf)__W, (__v8hf)_mm_setzero_ph(), 0, 8, 8, 8, 8, 8, 8, 8); - - return (__m128h)__builtin_ia32_loadsh128_mask((__v8hf *)__A, src, __U & 1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_load_sh(__mmask8 __U, const void *__A) { - return (__m128h)__builtin_ia32_loadsh128_mask( - (__v8hf *)__A, (__v8hf)_mm_setzero_ph(), __U & 1); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_load_ph(void const *__p) { - return *(const __m512h *)__p; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_load_ph(void const *__p) { - return *(const __m256h *)__p; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_load_ph(void const *__p) { - return *(const __m128h *)__p; -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_loadu_ph(void const *__p) { - struct __loadu_ph { - __m512h_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ph *)__p)->__v; -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_loadu_ph(void const *__p) { - struct __loadu_ph { - __m256h_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ph *)__p)->__v; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_loadu_ph(void const *__p) { - struct __loadu_ph { - __m128h_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ph *)__p)->__v; -} - -// stores with vmovsh: -static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_sh(void *__dp, - __m128h __a) { - struct __mm_store_sh_struct { - _Float16 __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_store_sh_struct *)__dp)->__u = __a[0]; -} - -static __inline__ void 
__DEFAULT_FN_ATTRS128 _mm_mask_store_sh(void *__W, - __mmask8 __U, - __m128h __A) { - __builtin_ia32_storesh128_mask((__v8hf *)__W, __A, __U & 1); -} - -static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_store_ph(void *__P, - __m512h __A) { - *(__m512h *)__P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_store_ph(void *__P, - __m256h __A) { - *(__m256h *)__P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 _mm_store_ph(void *__P, - __m128h __A) { - *(__m128h *)__P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS512 _mm512_storeu_ph(void *__P, - __m512h __A) { - struct __storeu_ph { - __m512h_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ph *)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 _mm256_storeu_ph(void *__P, - __m256h __A) { - struct __storeu_ph { - __m256h_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ph *)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 _mm_storeu_ph(void *__P, - __m128h __A) { - struct __storeu_ph { - __m128h_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ph *)__P)->__v = __A; -} - -// moves with vmovsh: -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_move_sh(__m128h __a, - __m128h __b) { - __a[0] = __b[0]; - return __a; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_move_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), __W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_move_sh(__mmask8 __U, - __m128h __A, - __m128h __B) { - return __builtin_ia32_selectsh_128(__U, _mm_move_sh(__A, __B), - _mm_setzero_ph()); -} - -// vmovw: -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtsi16_si128(short __a) { - return (__m128i)(__v8hi){__a, 0, 0, 0, 0, 0, 0, 0}; -} - -static __inline__ short __DEFAULT_FN_ATTRS128 _mm_cvtsi128_si16(__m128i __a) { - __v8hi __b = (__v8hi)__a; - return __b[0]; -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rcp_ph(__m512h __A) { - return (__m512h)__builtin_ia32_rcpph512_mask( - (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_rcp_ph(__m512h __W, __mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_rcpph512_mask((__v32hf)__A, (__v32hf)__W, - (__mmask32)__U); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_rcp_ph(__mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_rcpph512_mask( - (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_rsqrt_ph(__m512h __A) { - return (__m512h)__builtin_ia32_rsqrtph512_mask( - (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_rsqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_rsqrtph512_mask((__v32hf)__A, (__v32hf)__W, - (__mmask32)__U); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_rsqrt_ph(__mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_rsqrtph512_mask( - (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U); -} - -#define _mm512_getmant_ph(A, B, C) \ - ((__m512h)__builtin_ia32_getmantph512_mask( \ - (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \ - (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, \ - _MM_FROUND_CUR_DIRECTION)) - -#define _mm512_mask_getmant_ph(W, U, A, B, C) \ - 
((__m512h)__builtin_ia32_getmantph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_getmant_ph(U, A, B, C) \
-  ((__m512h)__builtin_ia32_getmantph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_getmant_round_ph(A, B, C, R) \
-  ((__m512h)__builtin_ia32_getmantph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
-      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getmant_round_ph(W, U, A, B, C, R) \
-  ((__m512h)__builtin_ia32_getmantph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), (__v32hf)(__m512h)(W), \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getmant_round_ph(U, A, B, C, R) \
-  ((__m512h)__builtin_ia32_getmantph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(((C) << 2) | (B)), \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_getexp_ph(__m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_getexp_ph(__m512h __W, __mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_getexp_ph(__mmask32 __U, __m512h __A) {
-  return (__m512h)__builtin_ia32_getexpph512_mask(
-      (__v32hf)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_getexp_round_ph(A, R) \
-  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
-      (__v32hf)_mm512_undefined_ph(), \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_getexp_round_ph(W, U, A, R) \
-  ((__m512h)__builtin_ia32_getexpph512_mask( \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(W), (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_getexp_round_ph(U, A, R) \
-  ((__m512h)__builtin_ia32_getexpph512_mask((__v32hf)(__m512h)(A), \
-      (__v32hf)_mm512_setzero_ph(), \
-      (__mmask32)(U), (int)(R)))
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_scalef_ph(__m512h __A,
-    __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_undefined_ph(), (__mmask32)-1,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_mask_scalef_ph(__m512h __W, __mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask((__v32hf)__A, (__v32hf)__B,
-      (__v32hf)__W, (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-static __inline__ __m512h __DEFAULT_FN_ATTRS512
-_mm512_maskz_scalef_ph(__mmask32 __U, __m512h __A, __m512h __B) {
-  return (__m512h)__builtin_ia32_scalefph512_mask(
-      (__v32hf)__A, (__v32hf)__B, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U,
-      _MM_FROUND_CUR_DIRECTION);
-}
-
-#define _mm512_scalef_round_ph(A, B, R) \
-  ((__m512h)__builtin_ia32_scalefph512_mask( \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
-      (__v32hf)_mm512_undefined_ph(), (__mmask32)-1, (int)(R)))
-
-#define _mm512_mask_scalef_round_ph(W, U, A, B, R) \
-  ((__m512h)__builtin_ia32_scalefph512_mask( \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(W), \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_scalef_round_ph(U, A, B, R) \
-  ((__m512h)__builtin_ia32_scalefph512_mask( \
-      (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), \
-      (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R)))
-
-#define _mm512_roundscale_ph(A, B) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask( \
-      (__v32hf)(__m512h)(A), (int)(B), (__v32hf)(__m512h)(A), (__mmask32)-1, \
-      _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_ph(A, B, C, imm) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask( \
-      (__v32hf)(__m512h)(C), (int)(imm), (__v32hf)(__m512h)(A), \
-      (__mmask32)(B), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_roundscale_ph(A, B, imm) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask( \
-      (__v32hf)(__m512h)(B), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
-      (__mmask32)(A), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_roundscale_round_ph(A, B, C, imm, R) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(C), (int)(imm), \
-      (__v32hf)(__m512h)(A), \
-      (__mmask32)(B), (int)(R)))
-
-#define _mm512_maskz_roundscale_round_ph(A, B, imm, R) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(B), (int)(imm), \
-      (__v32hf)_mm512_setzero_ph(), \
-      (__mmask32)(A), (int)(R)))
-
-#define _mm512_roundscale_round_ph(A, imm, R) \
-  ((__m512h)__builtin_ia32_rndscaleph_mask((__v32hf)(__m512h)(A), (int)(imm), \
-      (__v32hf)_mm512_undefined_ph(), \
-      (__mmask32)-1, (int)(R)))
-
-#define _mm512_reduce_ph(A, imm) \
-  ((__m512h)__builtin_ia32_reduceph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_undefined_ph(), \
-      (__mmask32)-1, _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_ph(W, U, A, imm) \
-  ((__m512h)__builtin_ia32_reduceph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)(__m512h)(W), \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_maskz_reduce_ph(U, A, imm) \
-  ((__m512h)__builtin_ia32_reduceph512_mask( \
-      (__v32hf)(__m512h)(A), (int)(imm), (__v32hf)_mm512_setzero_ph(), \
-      (__mmask32)(U), _MM_FROUND_CUR_DIRECTION))
-
-#define _mm512_mask_reduce_round_ph(W, U, A, imm, R) \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-      (__v32hf)(__m512h)(W), \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_maskz_reduce_round_ph(U, A, imm, R) \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-      (__v32hf)_mm512_setzero_ph(), \
-      (__mmask32)(U), (int)(R)))
-
-#define _mm512_reduce_round_ph(A, imm, R) \
-  ((__m512h)__builtin_ia32_reduceph512_mask((__v32hf)(__m512h)(A), (int)(imm), \
-      (__v32hf)_mm512_undefined_ph(), \
-      (__mmask32)-1, (int)(R)))
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_sh(__m128h __A,
-    __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_sh(__m128h __W,
-    __mmask8 __U,
-    __m128h __A,
-    __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask((__v8hf)__A, (__v8hf)__B,
-      (__v8hf)__W, (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_sh(__mmask8 __U,
-    __m128h __A,
-    __m128h __B) {
-  return (__m128h)__builtin_ia32_rcpsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_sh(__m128h __A,
-    __m128h __B) {
-  return (__m128h)__builtin_ia32_rsqrtsh_mask(
-      (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1);
-}
-
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_sh(__m128h __W,
-    __mmask8 __U,
-    __m128h __A,
- __m128h __B) { - return (__m128h)__builtin_ia32_rsqrtsh_mask((__v8hf)__A, (__v8hf)__B, - (__v8hf)__W, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt_sh(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_rsqrtsh_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -#define _mm_getmant_round_sh(A, B, C, D, R) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)_mm_setzero_ph(), (__mmask8)-1, (int)(R))) - -#define _mm_getmant_sh(A, B, C, D) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)_mm_setzero_ph(), (__mmask8)-1, _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_sh(W, U, A, B, C, D) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)(__m128h)(W), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_getmant_round_sh(W, U, A, B, C, D, R) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)(__m128h)(W), (__mmask8)(U), (int)(R))) - -#define _mm_maskz_getmant_sh(U, A, B, C, D) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)_mm_setzero_ph(), (__mmask8)(U), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_getmant_round_sh(U, A, B, C, D, R) \ - ((__m128h)__builtin_ia32_getmantsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (int)(((D) << 2) | (C)), \ - (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -#define _mm_getexp_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_sh(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_getexpsh128_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_getexpsh128_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_getexp_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_sh(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_getexpsh128_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_getexp_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_getexpsh128_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_scalef_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_scalefsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_sh(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_scalefsh_round_mask( - (__v8hf)__A, (__v8hf)(__B), 
(__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_scalefsh_round_mask((__v8hf)__A, (__v8hf)__B, - (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask_scalef_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_scalefsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_sh(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_scalefsh_round_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_scalef_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_scalefsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -#define _mm_roundscale_round_sh(A, B, imm, R) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(imm), (int)(R))) - -#define _mm_roundscale_sh(A, B, imm) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(imm), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_sh(W, U, A, B, I) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_roundscale_round_sh(W, U, A, B, I, R) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(I), (int)(R))) - -#define _mm_maskz_roundscale_sh(U, A, B, I) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(I), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_roundscale_round_sh(U, A, B, I, R) \ - ((__m128h)__builtin_ia32_rndscalesh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(I), (int)(R))) - -#define _mm_reduce_sh(A, B, C) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_mask_reduce_sh(W, U, A, B, C) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_maskz_reduce_sh(U, A, B, C) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(C), _MM_FROUND_CUR_DIRECTION)) - -#define _mm_reduce_round_sh(A, B, C, R) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(C), (int)(R))) - -#define _mm_mask_reduce_round_sh(W, U, A, B, C, R) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(C), (int)(R))) - -#define _mm_maskz_reduce_round_sh(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_reducesh_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), 
(__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(C), (int)(R))) - -#define _mm512_sqrt_round_ph(A, R) \ - ((__m512h)__builtin_ia32_sqrtph512((__v32hf)(__m512h)(A), (int)(R))) - -#define _mm512_mask_sqrt_round_ph(W, U, A, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ - (__v32hf)(__m512h)(W))) - -#define _mm512_maskz_sqrt_round_ph(U, A, R) \ - ((__m512h)__builtin_ia32_selectph_512( \ - (__mmask32)(U), (__v32hf)_mm512_sqrt_round_ph((A), (R)), \ - (__v32hf)_mm512_setzero_ph())) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_sqrt_ph(__m512h __A) { - return (__m512h)__builtin_ia32_sqrtph512((__v32hf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_sqrt_ph(__m512h __W, __mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), - (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)(__m512h)(__W)); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_sqrt_ph(__mmask32 __U, __m512h __A) { - return (__m512h)__builtin_ia32_selectph_512( - (__mmask32)(__U), - (__v32hf)__builtin_ia32_sqrtph512((__A), (_MM_FROUND_CUR_DIRECTION)), - (__v32hf)_mm512_setzero_ph()); -} - -#define _mm_sqrt_round_sh(A, B, R) \ - ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_sqrt_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_sqrt_round_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_sqrtsh_round_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_sh(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_sqrtsh_round_mask( - (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), - (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_sh(__m128h __W, - __mmask32 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_sqrtsh_round_mask( - (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)(__m128h)(__W), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_sh(__mmask32 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_sqrtsh_round_mask( - (__v8hf)(__m128h)(__A), (__v8hf)(__m128h)(__B), (__v8hf)_mm_setzero_ph(), - (__mmask8)(__U), _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fpclass_ph_mask(U, A, imm) \ - ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ - (int)(imm), (__mmask32)(U))) - -#define _mm512_fpclass_ph_mask(A, imm) \ - ((__mmask32)__builtin_ia32_fpclassph512_mask((__v32hf)(__m512h)(A), \ - (int)(imm), (__mmask32)-1)) - -#define _mm_fpclass_sh_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fpclass_sh_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclasssh_mask((__v8hf)(__m128h)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm512_cvt_roundpd_ph(A, R) \ - ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ - (__v8df)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundpd_ph(W, U, A, R) \ - 
((__m128h)__builtin_ia32_vcvtpd2ph512_mask((__v8df)(A), (__v8hf)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundpd_ph(U, A, R) \ - ((__m128h)__builtin_ia32_vcvtpd2ph512_mask( \ - (__v8df)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 _mm512_cvtpd_ph(__m512d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( - (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m512d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( - (__v8df)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtpd_ph(__mmask8 __U, __m512d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph512_mask( - (__v8df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_pd(A, R) \ - ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ - (__v8hf)(A), (__v8df)_mm512_undefined_pd(), (__mmask8)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundph_pd(W, U, A, R) \ - ((__m512d)__builtin_ia32_vcvtph2pd512_mask((__v8hf)(A), (__v8df)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_pd(U, A, R) \ - ((__m512d)__builtin_ia32_vcvtph2pd512_mask( \ - (__v8hf)(A), (__v8df)_mm512_setzero_pd(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 _mm512_cvtph_pd(__m128h __A) { - return (__m512d)__builtin_ia32_vcvtph2pd512_mask( - (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_pd(__m512d __W, __mmask8 __U, __m128h __A) { - return (__m512d)__builtin_ia32_vcvtph2pd512_mask( - (__v8hf)__A, (__v8df)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512d __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { - return (__m512d)__builtin_ia32_vcvtph2pd512_mask( - (__v8hf)__A, (__v8df)_mm512_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsh_ss(A, B, R) \ - ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)(-1), (int)(R))) - -#define _mm_mask_cvt_roundsh_ss(W, U, A, B, R) \ - ((__m128)__builtin_ia32_vcvtsh2ss_round_mask( \ - (__v4sf)(A), (__v8hf)(B), (__v4sf)(W), (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundsh_ss(U, A, B, R) \ - ((__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)(A), (__v8hf)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtsh_ss(__m128 __A, - __m128h __B) { - return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( - (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_undefined_ps(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_ss(__m128 __W, - __mmask8 __U, - __m128 __A, - __m128h __B) { - return (__m128)__builtin_ia32_vcvtsh2ss_round_mask((__v4sf)__A, (__v8hf)__B, - (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_maskz_cvtsh_ss(__mmask8 __U, - __m128 __A, - __m128h __B) { - return (__m128)__builtin_ia32_vcvtsh2ss_round_mask( - (__v4sf)__A, (__v8hf)__B, (__v4sf)_mm_setzero_ps(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundss_sh(A, B, R) \ - 
((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ - (__v8hf)_mm_undefined_ph(), \ - (__mmask8)(-1), (int)(R))) - -#define _mm_mask_cvt_roundss_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vcvtss2sh_round_mask( \ - (__v8hf)(A), (__v4sf)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundss_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_vcvtss2sh_round_mask((__v8hf)(A), (__v4sf)(B), \ - (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtss_sh(__m128h __A, - __m128 __B) { - return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( - (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtss_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128 __B) { - return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( - (__v8hf)__A, (__v4sf)__B, (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_cvtss_sh(__mmask8 __U, - __m128h __A, - __m128 __B) { - return (__m128h)__builtin_ia32_vcvtss2sh_round_mask( - (__v8hf)__A, (__v4sf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsd_sh(A, B, R) \ - ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ - (__v8hf)_mm_undefined_ph(), \ - (__mmask8)(-1), (int)(R))) - -#define _mm_mask_cvt_roundsd_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask( \ - (__v8hf)(A), (__v2df)(B), (__v8hf)(W), (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundsd_sh(U, A, B, R) \ - ((__m128h)__builtin_ia32_vcvtsd2sh_round_mask((__v8hf)(A), (__v2df)(B), \ - (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtsd_sh(__m128h __A, - __m128d __B) { - return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( - (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_undefined_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtsd_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128d __B) { - return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( - (__v8hf)__A, (__v2df)__B, (__v8hf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsd_sh(__mmask8 __U, __m128h __A, __m128d __B) { - return (__m128h)__builtin_ia32_vcvtsd2sh_round_mask( - (__v8hf)__A, (__v2df)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsh_sd(A, B, R) \ - ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)(-1), (int)(R))) - -#define _mm_mask_cvt_roundsh_sd(W, U, A, B, R) \ - ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask( \ - (__v2df)(A), (__v8hf)(B), (__v2df)(W), (__mmask8)(U), (int)(R))) - -#define _mm_maskz_cvt_roundsh_sd(U, A, B, R) \ - ((__m128d)__builtin_ia32_vcvtsh2sd_round_mask((__v2df)(A), (__v8hf)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtsh_sd(__m128d __A, - __m128h __B) { - return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( - (__v2df)__A, (__v8hf)__B, (__v2df)_mm_undefined_pd(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtsh_sd(__m128d __W, - __mmask8 __U, - __m128d __A, - __m128h __B) { - return 
(__m128d)__builtin_ia32_vcvtsh2sd_round_mask( - (__v2df)__A, (__v8hf)__B, (__v2df)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsh_sd(__mmask8 __U, __m128d __A, __m128h __B) { - return (__m128d)__builtin_ia32_vcvtsh2sd_round_mask( - (__v2df)__A, (__v8hf)__B, (__v2df)_mm_setzero_pd(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epi16(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ - (__v32hi)_mm512_undefined_epi32(), \ - (__mmask32)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundph_epi16(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), (__v32hi)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epi16(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2w512_mask((__v32hf)(A), \ - (__v32hi)_mm512_setzero_epi32(), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epi16(__m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2w512_mask( - (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2w512_mask( - (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epi16(__mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2w512_mask( - (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epi16(A, R) \ - ((__m512i)__builtin_ia32_vcvttph2w512_mask( \ - (__v32hf)(A), (__v32hi)_mm512_undefined_epi32(), (__mmask32)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epi16(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), (__v32hi)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epi16(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2w512_mask((__v32hf)(A), \ - (__v32hi)_mm512_setzero_epi32(), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epi16(__m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2w512_mask( - (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epi16(__m512i __W, __mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2w512_mask( - (__v32hf)__A, (__v32hi)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epi16(__mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2w512_mask( - (__v32hf)__A, (__v32hi)_mm512_setzero_epi32(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepi16_ph(A, R) \ - ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), \ - (__v32hf)_mm512_undefined_ph(), \ - (__mmask32)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepi16_ph(W, U, A, R) \ - ((__m512h)__builtin_ia32_vcvtw2ph512_mask((__v32hi)(A), (__v32hf)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepi16_ph(U, A, R) \ - ((__m512h)__builtin_ia32_vcvtw2ph512_mask( \ - (__v32hi)(A), (__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_cvtepi16_ph(__m512i __A) { - return (__m512h)__builtin_ia32_vcvtw2ph512_mask( - 
(__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi16_ph(__m512h __W, __mmask32 __U, __m512i __A) { - return (__m512h)__builtin_ia32_vcvtw2ph512_mask( - (__v32hi)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi16_ph(__mmask32 __U, __m512i __A) { - return (__m512h)__builtin_ia32_vcvtw2ph512_mask( - (__v32hi)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epu16(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uw512_mask( \ - (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ - (int)(R))) - -#define _mm512_mask_cvt_roundph_epu16(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epu16(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uw512_mask((__v32hf)(A), \ - (__v32hu)_mm512_setzero_epi32(), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epu16(__m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2uw512_mask( - (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2uw512_mask( - (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epu16(__mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvtph2uw512_mask( - (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epu16(A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uw512_mask( \ - (__v32hf)(A), (__v32hu)_mm512_undefined_epi32(), (__mmask32)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epu16(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), (__v32hu)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epu16(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uw512_mask((__v32hf)(A), \ - (__v32hu)_mm512_setzero_epi32(), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epu16(__m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2uw512_mask( - (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epu16(__m512i __W, __mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2uw512_mask( - (__v32hf)__A, (__v32hu)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epu16(__mmask32 __U, __m512h __A) { - return (__m512i)__builtin_ia32_vcvttph2uw512_mask( - (__v32hf)__A, (__v32hu)_mm512_setzero_epi32(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepu16_ph(A, R) \ - ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), \ - (__v32hf)_mm512_undefined_ph(), \ - (__mmask32)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepu16_ph(W, U, A, R) \ - ((__m512h)__builtin_ia32_vcvtuw2ph512_mask((__v32hu)(A), (__v32hf)(W), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepu16_ph(U, A, R) \ - ((__m512h)__builtin_ia32_vcvtuw2ph512_mask( \ - (__v32hu)(A), 
(__v32hf)_mm512_setzero_ph(), (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_cvtepu16_ph(__m512i __A) { - return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( - (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu16_ph(__m512h __W, __mmask32 __U, __m512i __A) { - return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( - (__v32hu)__A, (__v32hf)__W, (__mmask32)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu16_ph(__mmask32 __U, __m512i __A) { - return (__m512h)__builtin_ia32_vcvtuw2ph512_mask( - (__v32hu)__A, (__v32hf)_mm512_setzero_ph(), (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epi32(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2dq512_mask( \ - (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ - (int)(R))) - -#define _mm512_mask_cvt_roundph_epi32(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), (__v16si)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epi32(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2dq512_mask((__v16hf)(A), \ - (__v16si)_mm512_setzero_epi32(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epi32(__m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2dq512_mask( - (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2dq512_mask( - (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epi32(__mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2dq512_mask( - (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epu32(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2udq512_mask( \ - (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ - (int)(R))) - -#define _mm512_mask_cvt_roundph_epu32(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), (__v16su)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epu32(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2udq512_mask((__v16hf)(A), \ - (__v16su)_mm512_setzero_epi32(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epu32(__m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2udq512_mask( - (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2udq512_mask( - (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epu32(__mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvtph2udq512_mask( - (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepi32_ph(A, R) \ - ((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), \ - (__v16hf)_mm256_undefined_ph(), \ - (__mmask16)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepi32_ph(W, U, A, R) \ - 
((__m256h)__builtin_ia32_vcvtdq2ph512_mask((__v16si)(A), (__v16hf)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepi32_ph(U, A, R) \ - ((__m256h)__builtin_ia32_vcvtdq2ph512_mask( \ - (__v16si)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_cvtepi32_ph(__m512i __A) { - return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( - (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi32_ph(__m256h __W, __mmask16 __U, __m512i __A) { - return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( - (__v16si)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi32_ph(__mmask16 __U, __m512i __A) { - return (__m256h)__builtin_ia32_vcvtdq2ph512_mask( - (__v16si)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepu32_ph(A, R) \ - ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), \ - (__v16hf)_mm256_undefined_ph(), \ - (__mmask16)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepu32_ph(W, U, A, R) \ - ((__m256h)__builtin_ia32_vcvtudq2ph512_mask((__v16su)(A), (__v16hf)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepu32_ph(U, A, R) \ - ((__m256h)__builtin_ia32_vcvtudq2ph512_mask( \ - (__v16su)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_cvtepu32_ph(__m512i __A) { - return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( - (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu32_ph(__m256h __W, __mmask16 __U, __m512i __A) { - return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( - (__v16su)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu32_ph(__mmask16 __U, __m512i __A) { - return (__m256h)__builtin_ia32_vcvtudq2ph512_mask( - (__v16su)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epi32(A, R) \ - ((__m512i)__builtin_ia32_vcvttph2dq512_mask( \ - (__v16hf)(A), (__v16si)_mm512_undefined_epi32(), (__mmask16)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epi32(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), (__v16si)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epi32(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2dq512_mask((__v16hf)(A), \ - (__v16si)_mm512_setzero_epi32(), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epi32(__m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2dq512_mask( - (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epi32(__m512i __W, __mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2dq512_mask( - (__v16hf)__A, (__v16si)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epi32(__mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2dq512_mask( - (__v16hf)__A, (__v16si)_mm512_setzero_epi32(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epu32(A, R) 
\ - ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ - (__v16hf)(A), (__v16su)_mm512_undefined_epi32(), (__mmask16)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epu32(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2udq512_mask((__v16hf)(A), (__v16su)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epu32(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2udq512_mask( \ - (__v16hf)(A), (__v16su)_mm512_setzero_epi32(), (__mmask16)(U), \ - (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epu32(__m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2udq512_mask( - (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epu32(__m512i __W, __mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2udq512_mask( - (__v16hf)__A, (__v16su)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epu32(__mmask16 __U, __m256h __A) { - return (__m512i)__builtin_ia32_vcvttph2udq512_mask( - (__v16hf)__A, (__v16su)_mm512_setzero_epi32(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepi64_ph(A, R) \ - ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ - (__v8di)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepi64_ph(W, U, A, R) \ - ((__m128h)__builtin_ia32_vcvtqq2ph512_mask((__v8di)(A), (__v8hf)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepi64_ph(U, A, R) \ - ((__m128h)__builtin_ia32_vcvtqq2ph512_mask( \ - (__v8di)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_cvtepi64_ph(__m512i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( - (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m512i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( - (__v8di)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepi64_ph(__mmask8 __U, __m512i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph512_mask( - (__v8di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epi64(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), \ - (__v8di)_mm512_undefined_epi32(), \ - (__mmask8)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundph_epi64(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2qq512_mask((__v8hf)(A), (__v8di)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2qq512_mask( \ - (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epi64(__m128h __A) { - return (__m512i)__builtin_ia32_vcvtph2qq512_mask( - (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvtph2qq512_mask( - (__v8hf)__A, (__v8di)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { - return 
(__m512i)__builtin_ia32_vcvtph2qq512_mask( - (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundepu64_ph(A, R) \ - ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ - (__v8du)(A), (__v8hf)_mm_undefined_ph(), (__mmask8)(-1), (int)(R))) - -#define _mm512_mask_cvt_roundepu64_ph(W, U, A, R) \ - ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask((__v8du)(A), (__v8hf)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundepu64_ph(U, A, R) \ - ((__m128h)__builtin_ia32_vcvtuqq2ph512_mask( \ - (__v8du)(A), (__v8hf)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_cvtepu64_ph(__m512i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( - (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m512i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( - (__v8du)__A, (__v8hf)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtepu64_ph(__mmask8 __U, __m512i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph512_mask( - (__v8du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvt_roundph_epu64(A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ - (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ - (int)(R))) - -#define _mm512_mask_cvt_roundph_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvt_roundph_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_vcvtph2uqq512_mask( \ - (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvtph_epu64(__m128h __A) { - return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( - (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( - (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvtph2uqq512_mask( - (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epi64(A, R) \ - ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ - (__v8hf)(A), (__v8di)_mm512_undefined_epi32(), (__mmask8)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epi64(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2qq512_mask((__v8hf)(A), (__v8di)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epi64(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2qq512_mask( \ - (__v8hf)(A), (__v8di)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epi64(__m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2qq512_mask( - (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epi64(__m512i __W, __mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2qq512_mask( - (__v8hf)__A, (__v8di)__W, (__mmask8)__U, 
_MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2qq512_mask( - (__v8hf)__A, (__v8di)_mm512_setzero_epi32(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtt_roundph_epu64(A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ - (__v8hf)(A), (__v8du)_mm512_undefined_epi32(), (__mmask8)(-1), \ - (int)(R))) - -#define _mm512_mask_cvtt_roundph_epu64(W, U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uqq512_mask((__v8hf)(A), (__v8du)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm512_maskz_cvtt_roundph_epu64(U, A, R) \ - ((__m512i)__builtin_ia32_vcvttph2uqq512_mask( \ - (__v8hf)(A), (__v8du)_mm512_setzero_epi32(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_cvttph_epu64(__m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( - (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_mask_cvttph_epu64(__m512i __W, __mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( - (__v8hf)__A, (__v8du)__W, (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { - return (__m512i)__builtin_ia32_vcvttph2uqq512_mask( - (__v8hf)__A, (__v8du)_mm512_setzero_epi32(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsh_i32(A, R) \ - ((int)__builtin_ia32_vcvtsh2si32((__v8hf)(A), (int)(R))) - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvtsh_i32(__m128h __A) { - return (int)__builtin_ia32_vcvtsh2si32((__v8hf)__A, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsh_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)(A), (int)(R))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS128 -_mm_cvtsh_u32(__m128h __A) { - return (unsigned int)__builtin_ia32_vcvtsh2usi32((__v8hf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvt_roundsh_i64(A, R) \ - ((long long)__builtin_ia32_vcvtsh2si64((__v8hf)(A), (int)(R))) - -static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvtsh_i64(__m128h __A) { - return (long long)__builtin_ia32_vcvtsh2si64((__v8hf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_cvt_roundsh_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvtsh2usi64((__v8hf)(A), (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvtsh_u64(__m128h __A) { - return (unsigned long long)__builtin_ia32_vcvtsh2usi64( - (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); -} -#endif // __x86_64__ - -#define _mm_cvt_roundu32_sh(A, B, R) \ - ((__m128h)__builtin_ia32_vcvtusi2sh((__v8hf)(A), (unsigned int)(B), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_cvtu32_sh(__m128h __A, unsigned int __B) { - __A[0] = __B; - return __A; -} - -#ifdef __x86_64__ -#define _mm_cvt_roundu64_sh(A, B, R) \ - ((__m128h)__builtin_ia32_vcvtusi642sh((__v8hf)(A), (unsigned long long)(B), \ - (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_cvtu64_sh(__m128h __A, unsigned long long __B) { - __A[0] = __B; - return __A; -} -#endif - -#define _mm_cvt_roundi32_sh(A, B, R) \ - ((__m128h)__builtin_ia32_vcvtsi2sh((__v8hf)(A), (int)(B), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti32_sh(__m128h __A, - int __B) { - __A[0] = __B; - return __A; -} - -#ifdef __x86_64__ -#define _mm_cvt_roundi64_sh(A, B, 
R) \ - ((__m128h)__builtin_ia32_vcvtsi642sh((__v8hf)(A), (long long)(B), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvti64_sh(__m128h __A, - long long __B) { - __A[0] = __B; - return __A; -} -#endif - -#define _mm_cvtt_roundsh_i32(A, R) \ - ((int)__builtin_ia32_vcvttsh2si32((__v8hf)(A), (int)(R))) - -static __inline__ int __DEFAULT_FN_ATTRS128 _mm_cvttsh_i32(__m128h __A) { - return (int)__builtin_ia32_vcvttsh2si32((__v8hf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundsh_i64(A, R) \ - ((long long)__builtin_ia32_vcvttsh2si64((__v8hf)(A), (int)(R))) - -static __inline__ long long __DEFAULT_FN_ATTRS128 _mm_cvttsh_i64(__m128h __A) { - return (long long)__builtin_ia32_vcvttsh2si64((__v8hf)__A, - _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm_cvtt_roundsh_u32(A, R) \ - ((unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)(A), (int)(R))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS128 -_mm_cvttsh_u32(__m128h __A) { - return (unsigned int)__builtin_ia32_vcvttsh2usi32((__v8hf)__A, - _MM_FROUND_CUR_DIRECTION); -} - -#ifdef __x86_64__ -#define _mm_cvtt_roundsh_u64(A, R) \ - ((unsigned long long)__builtin_ia32_vcvttsh2usi64((__v8hf)(A), (int)(R))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS128 -_mm_cvttsh_u64(__m128h __A) { - return (unsigned long long)__builtin_ia32_vcvttsh2usi64( - (__v8hf)__A, _MM_FROUND_CUR_DIRECTION); -} -#endif - -#define _mm512_cvtx_roundph_ps(A, R) \ - ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), \ - (__v16sf)_mm512_undefined_ps(), \ - (__mmask16)(-1), (int)(R))) - -#define _mm512_mask_cvtx_roundph_ps(W, U, A, R) \ - ((__m512)__builtin_ia32_vcvtph2psx512_mask((__v16hf)(A), (__v16sf)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtx_roundph_ps(U, A, R) \ - ((__m512)__builtin_ia32_vcvtph2psx512_mask( \ - (__v16hf)(A), (__v16sf)_mm512_setzero_ps(), (__mmask16)(U), (int)(R))) - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 _mm512_cvtxph_ps(__m256h __A) { - return (__m512)__builtin_ia32_vcvtph2psx512_mask( - (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtxph_ps(__m512 __W, __mmask16 __U, __m256h __A) { - return (__m512)__builtin_ia32_vcvtph2psx512_mask( - (__v16hf)__A, (__v16sf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512 __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtxph_ps(__mmask16 __U, __m256h __A) { - return (__m512)__builtin_ia32_vcvtph2psx512_mask( - (__v16hf)__A, (__v16sf)_mm512_setzero_ps(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_cvtx_roundps_ph(A, R) \ - ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), \ - (__v16hf)_mm256_undefined_ph(), \ - (__mmask16)(-1), (int)(R))) - -#define _mm512_mask_cvtx_roundps_ph(W, U, A, R) \ - ((__m256h)__builtin_ia32_vcvtps2phx512_mask((__v16sf)(A), (__v16hf)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_cvtx_roundps_ph(U, A, R) \ - ((__m256h)__builtin_ia32_vcvtps2phx512_mask( \ - (__v16sf)(A), (__v16hf)_mm256_setzero_ph(), (__mmask16)(U), (int)(R))) - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 _mm512_cvtxps_ph(__m512 __A) { - return (__m256h)__builtin_ia32_vcvtps2phx512_mask( - (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_mask_cvtxps_ph(__m256h __W, __mmask16 __U, __m512 __A) { - return (__m256h)__builtin_ia32_vcvtps2phx512_mask( - 
(__v16sf)__A, (__v16hf)__W, (__mmask16)__U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS512 -_mm512_maskz_cvtxps_ph(__mmask16 __U, __m512 __A) { - return (__m256h)__builtin_ia32_vcvtps2phx512_mask( - (__v16sf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmadd_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask_fmadd_round_ph(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_mask3_fmadd_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_fmadd_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_fmsub_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask_fmsub_round_ph(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_fmsub_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_fnmadd_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask3_fnmadd_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask3( \ - -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_fnmadd_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ - -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_fnmsub_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_maskz_fnmsub_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_maskz( \ - -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_ph(__m512h __A, - __m512h __B, - __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmaddph512_mask3((__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - 
_MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_maskz((__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmsub_ph(__m512h __A, - __m512h __B, - __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, - -(__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, (__v32hf)__B, - -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_maskz( - (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmadd_ph(__m512h __A, - __m512h __B, - __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, - (__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmaddph512_mask3(-(__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_maskz(-(__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fnmsub_ph(__m512h __A, - __m512h __B, - __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, - -(__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fnmsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_maskz( - -(__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmaddsub_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask_fmaddsub_round_ph(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_mask3_fmaddsub_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_mask3( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_fmaddsub_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_fmsubadd_round_ph(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)-1, (int)(R))) - -#define _mm512_mask_fmsubadd_round_ph(A, U, B, C, R) \ - 
((__m512h)__builtin_ia32_vfmaddsubph512_mask( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_maskz_fmsubadd_round_ph(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddsubph512_maskz( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_mask( - (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmaddsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_mask( - (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmaddsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmaddsubph512_mask3( - (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmaddsub_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( - (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_mask( - (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmsubadd_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_mask( - (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmsubadd_ph(__mmask32 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddsubph512_maskz( - (__v32hf)__A, (__v32hf)__B, -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsub_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmsubph512_mask3((__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask3_fmsubadd_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmsubaddph512_mask3( \ - (__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmsubadd_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmsubaddph512_mask3( - (__v32hf)__A, (__v32hf)__B, (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmadd_round_ph(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmadd_ph(__m512h __A, __mmask32 
__U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_mask_fnmsub_round_ph(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddph512_mask( \ - (__v32hf)(__m512h)(A), -(__v32hf)(__m512h)(B), -(__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -#define _mm512_mask3_fnmsub_round_ph(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmsubph512_mask3( \ - -(__v32hf)(__m512h)(A), (__v32hf)(__m512h)(B), (__v32hf)(__m512h)(C), \ - (__mmask32)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fnmsub_ph(__m512h __A, __mmask32 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddph512_mask((__v32hf)__A, -(__v32hf)__B, - -(__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fnmsub_ph(__m512h __A, __m512h __B, __m512h __C, __mmask32 __U) { - return (__m512h)__builtin_ia32_vfmsubph512_mask3(-(__v32hf)__A, (__v32hf)__B, - (__v32hf)__C, (__mmask32)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sh(__m128h __W, - __m128h __A, - __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, - (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, (__v8hf)__B, - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmadd_round_sh(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fmadd_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, (__v8hf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmadd_round_sh(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { - return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmadd_round_sh(W, X, Y, U, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ - (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_sh(__m128h __W, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, - -(__v8hf)__B, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_sh(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmaddsh3_mask((__v8hf)__W, (__v8hf)__A, - -(__v8hf)__B, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmsub_round_sh(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(A), 
(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fmsub_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, (__v8hf)__B, - -(__v8hf)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fmsub_round_sh(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ - (__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ - (__mmask8)(U), (int)R)) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { - return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, (__v8hf)__X, (__v8hf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fmsub_round_sh(W, X, Y, U, R) \ - ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ - (__v8hf)(__m128h)(W), (__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_sh(__m128h __W, - __m128h __A, - __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, - (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, (__v8hf)__B, - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmadd_round_sh(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fnmadd_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), (__v8hf)(__m128h)(B), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmadd_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmadd_round_sh(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ - (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), (__v8hf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { - return __builtin_ia32_vfmaddsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmadd_round_sh(W, X, Y, U, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask3( \ - (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_sh(__m128h __W, - __m128h __A, - __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, - (__mmask8)-1, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_sh(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return __builtin_ia32_vfmaddsh3_mask((__v8hf)__W, -(__v8hf)__A, -(__v8hf)__B, - (__mmask8)__U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fnmsub_round_sh(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - 
(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fnmsub_round_sh(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_mask( \ - (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_sh(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return __builtin_ia32_vfmaddsh3_maskz((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_maskz_fnmsub_round_sh(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddsh3_maskz( \ - (__v8hf)(__m128h)(A), -(__v8hf)(__m128h)(B), -(__v8hf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_sh(__m128h __W, __m128h __X, __m128h __Y, __mmask8 __U) { - return __builtin_ia32_vfmsubsh3_mask3((__v8hf)__W, -(__v8hf)__X, (__v8hf)__Y, - (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_mask3_fnmsub_round_sh(W, X, Y, U, R) \ - ((__m128h)__builtin_ia32_vfmsubsh3_mask3( \ - (__v8hf)(__m128h)(W), -(__v8hf)(__m128h)(X), (__v8hf)(__m128h)(Y), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_sch(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcsh_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fcmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask( - (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fcmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fcmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fcmadd_round_sch(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfcmaddcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fcmadd_round_sch(A, U, B, C, R) \ - ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fcmadd_round_sch(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfcmaddcsh_maskz( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -#define _mm_mask3_fcmadd_round_sch(A, B, C, U, R) \ - ((__m128h)__builtin_ia32_vfcmaddcsh_round_mask3( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_sch(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcsh_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_sch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcsh_round_mask( - (__v4sf)__A, (__v4sf)(__B), (__v4sf)(__C), __U, _MM_FROUND_CUR_DIRECTION); -} - 
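
For orientation, a minimal usage sketch of the scalar FP16 fused multiply-add intrinsics above (illustrative only, not part of the patch; it assumes a clang build with -mavx512fp16 on an AVX512-FP16 capable target, and that _mm_set_sh and _mm_cvtsh_h are available from the same header family):

/* Sketch: r = a*b + c on the low _Float16 lane, then read the result back.
 * Assumes -mavx512fp16 and an AVX512-FP16 capable CPU. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	__m128h a = _mm_set_sh((_Float16)2.0);
	__m128h b = _mm_set_sh((_Float16)3.0);
	__m128h c = _mm_set_sh((_Float16)1.0);
	__m128h r = _mm_fmadd_sh(a, b, c);      /* low lane: 2*3 + 1 = 7 */
	printf("%f\n", (double)_mm_cvtsh_h(r)); /* prints 7.000000 */
	return 0;
}

The masked and rounding variants shown in the diff follow the same pattern, with the mask selecting whether the computed lane or the pass-through/zero value is written.
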
-static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_sch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcsh_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_sch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfmaddcsh_round_mask3( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, __U, _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmadd_round_sch(A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)-1, (int)(R))) - -#define _mm_mask_fmadd_round_sch(A, U, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddcsh_round_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fmadd_round_sch(U, A, B, C, R) \ - ((__m128h)__builtin_ia32_vfmaddcsh_maskz( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -#define _mm_mask3_fmadd_round_sch(A, B, C, U, R) \ - ((__m128h)__builtin_ia32_vfmaddcsh_round_mask3( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(C), \ - (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_sch(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcsh_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fcmul_sch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcsh_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fcmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcsh_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fcmul_round_sch(A, B, R) \ - ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ - (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) - -#define _mm_mask_fcmul_round_sch(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fcmul_round_sch(U, A, B, R) \ - ((__m128h)__builtin_ia32_vfcmulcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ - (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_sch(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcsh_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_sch(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcsh_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmul_sch(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcsh_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm_fmul_round_sch(A, B, R) \ - ((__m128h)__builtin_ia32_vfmulcsh_mask( \ - 
(__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ - (__v4sf)(__m128h)_mm_undefined_ph(), (__mmask8)-1, (int)(R))) - -#define _mm_mask_fmul_round_sch(W, U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmulcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), (__v4sf)(__m128h)(W), \ - (__mmask8)(U), (int)(R))) - -#define _mm_maskz_fmul_round_sch(U, A, B, R) \ - ((__m128h)__builtin_ia32_vfmulcsh_mask( \ - (__v4sf)(__m128h)(A), (__v4sf)(__m128h)(B), \ - (__v4sf)(__m128h)_mm_setzero_ph(), (__mmask8)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmul_pch(__m512h __A, - __m512h __B) { - return (__m512h)__builtin_ia32_vfcmulcph512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fcmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_vfcmulcph512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fcmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_vfcmulcph512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fcmul_round_pch(A, B, R) \ - ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ - (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) - -#define _mm512_mask_fcmul_round_pch(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_fcmul_round_pch(U, A, B, R) \ - ((__m512h)__builtin_ia32_vfcmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ - (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmul_pch(__m512h __A, - __m512h __B) { - return (__m512h)__builtin_ia32_vfmulcph512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_undefined_ph(), (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmul_pch(__m512h __W, __mmask16 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_vfmulcph512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__W, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmul_pch(__mmask16 __U, __m512h __A, __m512h __B) { - return (__m512h)__builtin_ia32_vfmulcph512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)_mm512_setzero_ph(), (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmul_round_pch(A, B, R) \ - ((__m512h)__builtin_ia32_vfmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ - (__v16sf)(__m512h)_mm512_undefined_ph(), (__mmask16)-1, (int)(R))) - -#define _mm512_mask_fmul_round_pch(W, U, A, B, R) \ - ((__m512h)__builtin_ia32_vfmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(W), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_fmul_round_pch(U, A, B, R) \ - ((__m512h)__builtin_ia32_vfmulcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), \ - (__v16sf)(__m512h)_mm512_setzero_ph(), (__mmask16)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fcmadd_pch(__m512h __A, - __m512h __B, - __m512h __C) { - return 
(__m512h)__builtin_ia32_vfcmaddcph512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fcmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfcmaddcph512_mask( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fcmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { - return (__m512h)__builtin_ia32_vfcmaddcph512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fcmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfcmaddcph512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fcmadd_round_pch(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_fcmadd_round_pch(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfcmaddcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_mask3_fcmadd_round_pch(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfcmaddcph512_mask3( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_fcmadd_round_pch(U, A, B, C, R) \ - ((__m512h)__builtin_ia32_vfcmaddcph512_maskz( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 _mm512_fmadd_pch(__m512h __A, - __m512h __B, - __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddcph512_mask3((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)-1, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_fmadd_pch(__m512h __A, __mmask16 __U, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddcph512_mask((__v16sf)__A, (__v16sf)__B, - (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask3_fmadd_pch(__m512h __A, __m512h __B, __m512h __C, __mmask16 __U) { - return (__m512h)__builtin_ia32_vfmaddcph512_mask3( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_maskz_fmadd_pch(__mmask16 __U, __m512h __A, __m512h __B, __m512h __C) { - return (__m512h)__builtin_ia32_vfmaddcph512_maskz( - (__v16sf)__A, (__v16sf)__B, (__v16sf)__C, (__mmask16)__U, - _MM_FROUND_CUR_DIRECTION); -} - -#define _mm512_fmadd_round_pch(A, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)-1, (int)(R))) - -#define _mm512_mask_fmadd_round_pch(A, U, B, C, R) \ - ((__m512h)__builtin_ia32_vfmaddcph512_mask( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_mask3_fmadd_round_pch(A, B, C, U, R) \ - ((__m512h)__builtin_ia32_vfmaddcph512_mask3( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -#define _mm512_maskz_fmadd_round_pch(U, A, B, C, R) \ - 
((__m512h)__builtin_ia32_vfmaddcph512_maskz( \ - (__v16sf)(__m512h)(A), (__v16sf)(__m512h)(B), (__v16sf)(__m512h)(C), \ - (__mmask16)(U), (int)(R))) - -static __inline__ _Float16 __DEFAULT_FN_ATTRS512 -_mm512_reduce_add_ph(__m512h __W) { - return __builtin_ia32_reduce_fadd_ph512(-0.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS512 -_mm512_reduce_mul_ph(__m512h __W) { - return __builtin_ia32_reduce_fmul_ph512(1.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS512 -_mm512_reduce_max_ph(__m512h __V) { - return __builtin_ia32_reduce_fmax_ph512(__V); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS512 -_mm512_reduce_min_ph(__m512h __V) { - return __builtin_ia32_reduce_fmin_ph512(__V); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_mask_blend_ph(__mmask32 __U, __m512h __A, __m512h __W) { - return (__m512h)__builtin_ia32_selectph_512((__mmask32)__U, (__v32hf)__W, - (__v32hf)__A); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_permutex2var_ph(__m512h __A, __m512i __I, __m512h __B) { - return (__m512h)__builtin_ia32_vpermi2varhi512((__v32hi)__A, (__v32hi)__I, - (__v32hi)__B); -} - -static __inline__ __m512h __DEFAULT_FN_ATTRS512 -_mm512_permutexvar_ph(__m512i __A, __m512h __B) { - return (__m512h)__builtin_ia32_permvarhi512((__v32hi)__B, (__v32hi)__A); -} - -// intrinsics below are alias for f*mul_*ch -#define _mm512_mul_pch(A, B) _mm512_fmul_pch(A, B) -#define _mm512_mask_mul_pch(W, U, A, B) _mm512_mask_fmul_pch(W, U, A, B) -#define _mm512_maskz_mul_pch(U, A, B) _mm512_maskz_fmul_pch(U, A, B) -#define _mm512_mul_round_pch(A, B, R) _mm512_fmul_round_pch(A, B, R) -#define _mm512_mask_mul_round_pch(W, U, A, B, R) \ - _mm512_mask_fmul_round_pch(W, U, A, B, R) -#define _mm512_maskz_mul_round_pch(U, A, B, R) \ - _mm512_maskz_fmul_round_pch(U, A, B, R) - -#define _mm512_cmul_pch(A, B) _mm512_fcmul_pch(A, B) -#define _mm512_mask_cmul_pch(W, U, A, B) _mm512_mask_fcmul_pch(W, U, A, B) -#define _mm512_maskz_cmul_pch(U, A, B) _mm512_maskz_fcmul_pch(U, A, B) -#define _mm512_cmul_round_pch(A, B, R) _mm512_fcmul_round_pch(A, B, R) -#define _mm512_mask_cmul_round_pch(W, U, A, B, R) \ - _mm512_mask_fcmul_round_pch(W, U, A, B, R) -#define _mm512_maskz_cmul_round_pch(U, A, B, R) \ - _mm512_maskz_fcmul_round_pch(U, A, B, R) - -#define _mm_mul_sch(A, B) _mm_fmul_sch(A, B) -#define _mm_mask_mul_sch(W, U, A, B) _mm_mask_fmul_sch(W, U, A, B) -#define _mm_maskz_mul_sch(U, A, B) _mm_maskz_fmul_sch(U, A, B) -#define _mm_mul_round_sch(A, B, R) _mm_fmul_round_sch(A, B, R) -#define _mm_mask_mul_round_sch(W, U, A, B, R) \ - _mm_mask_fmul_round_sch(W, U, A, B, R) -#define _mm_maskz_mul_round_sch(U, A, B, R) _mm_maskz_fmul_round_sch(U, A, B, R) - -#define _mm_cmul_sch(A, B) _mm_fcmul_sch(A, B) -#define _mm_mask_cmul_sch(W, U, A, B) _mm_mask_fcmul_sch(W, U, A, B) -#define _mm_maskz_cmul_sch(U, A, B) _mm_maskz_fcmul_sch(U, A, B) -#define _mm_cmul_round_sch(A, B, R) _mm_fcmul_round_sch(A, B, R) -#define _mm_mask_cmul_round_sch(W, U, A, B, R) \ - _mm_mask_fcmul_round_sch(W, U, A, B, R) -#define _mm_maskz_cmul_round_sch(U, A, B, R) \ - _mm_maskz_fcmul_round_sch(U, A, B, R) - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 -#undef __DEFAULT_FN_ATTRS512 - -#endif diff --git a/include/avx512ifmaintrin.h b/include/avx512ifmaintrin.h deleted file mode 100644 index 5f7da52..0000000 --- a/include/avx512ifmaintrin.h +++ /dev/null @@ -1,68 +0,0 @@ -/*===------------- avx512ifmaintrin.h - IFMA intrinsics ------------------=== - * - * - * Part of the LLVM Project, under 
the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __IFMAINTRIN_H -#define __IFMAINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma"), __min_vector_width__(512))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52hi_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52huq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52hi_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__W, __X, __Y), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52hi_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52hi_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_madd52lo_epu64 (__m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_vpmadd52luq512((__v8di) __X, (__v8di) __Y, - (__v8di) __Z); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_madd52lo_epu64 (__m512i __W, __mmask8 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__W, __X, __Y), - (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_madd52lo_epu64 (__mmask8 __M, __m512i __X, __m512i __Y, __m512i __Z) -{ - return (__m512i)__builtin_ia32_selectq_512(__M, - (__v8di)_mm512_madd52lo_epu64(__X, __Y, __Z), - (__v8di)_mm512_setzero_si512()); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512ifmavlintrin.h b/include/avx512ifmavlintrin.h deleted file mode 100644 index 5889401..0000000 --- a/include/avx512ifmavlintrin.h +++ /dev/null @@ -1,119 +0,0 @@ -/*===------------- avx512ifmavlintrin.h - IFMA intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __IFMAVLINTRIN_H -#define __IFMAVLINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512ifma,avx512vl"), __min_vector_width__(256))) - - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_madd52hi_epu64 (__m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di) __X, (__v2di) __Y, - (__v2di) __Z); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_madd52hi_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_madd52hi_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_madd52hi_epu64 (__m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, - (__v4di)__Z); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_madd52hi_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_madd52hi_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_madd52lo_epu64 (__m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, - (__v2di)__Z); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_madd52lo_epu64 (__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_madd52lo_epu64 (__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_madd52lo_epu64 (__m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, - (__v4di)__Z); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_madd52lo_epu64 (__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_madd52lo_epu64 (__mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), - (__v4di)_mm256_setzero_si256()); -} - - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512pfintrin.h b/include/avx512pfintrin.h deleted file mode 100644 index b8bcf49..0000000 --- a/include/avx512pfintrin.h +++ /dev/null @@ -1,97 +0,0 @@ -/*===------------- avx512pfintrin.h - PF intrinsics ------------------------=== - * - * - * Part of the 
LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512PFINTRIN_H -#define __AVX512PFINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512pf"))) - -#define _mm512_mask_prefetch_i32gather_pd(index, mask, addr, scale, hint) \ - __builtin_ia32_gatherpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ - (void const *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_prefetch_i32gather_pd(index, addr, scale, hint) \ - __builtin_ia32_gatherpfdpd((__mmask8) -1, (__v8si)(__m256i)(index), \ - (void const *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_mask_prefetch_i32gather_ps(index, mask, addr, scale, hint) \ - __builtin_ia32_gatherpfdps((__mmask16)(mask), \ - (__v16si)(__m512i)(index), (void const *)(addr), \ - (int)(scale), (int)(hint)) - -#define _mm512_prefetch_i32gather_ps(index, addr, scale, hint) \ - __builtin_ia32_gatherpfdps((__mmask16) -1, \ - (__v16si)(__m512i)(index), (void const *)(addr), \ - (int)(scale), (int)(hint)) - -#define _mm512_mask_prefetch_i64gather_pd(index, mask, addr, scale, hint) \ - __builtin_ia32_gatherpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (void const *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_prefetch_i64gather_pd(index, addr, scale, hint) \ - __builtin_ia32_gatherpfqpd((__mmask8) -1, (__v8di)(__m512i)(index), \ - (void const *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_mask_prefetch_i64gather_ps(index, mask, addr, scale, hint) \ - __builtin_ia32_gatherpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (void const *)(addr), (int)(scale), (int)(hint)) - -#define _mm512_prefetch_i64gather_ps(index, addr, scale, hint) \ - __builtin_ia32_gatherpfqps((__mmask8) -1, (__v8di)(__m512i)(index), \ - (void const *)(addr), (int)(scale), (int)(hint)) - -#define _mm512_prefetch_i32scatter_pd(addr, index, scale, hint) \ - __builtin_ia32_scatterpfdpd((__mmask8)-1, (__v8si)(__m256i)(index), \ - (void *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_mask_prefetch_i32scatter_pd(addr, mask, index, scale, hint) \ - __builtin_ia32_scatterpfdpd((__mmask8)(mask), (__v8si)(__m256i)(index), \ - (void *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_prefetch_i32scatter_ps(addr, index, scale, hint) \ - __builtin_ia32_scatterpfdps((__mmask16)-1, (__v16si)(__m512i)(index), \ - (void *)(addr), (int)(scale), (int)(hint)) - -#define _mm512_mask_prefetch_i32scatter_ps(addr, mask, index, scale, hint) \ - __builtin_ia32_scatterpfdps((__mmask16)(mask), \ - (__v16si)(__m512i)(index), (void *)(addr), \ - (int)(scale), (int)(hint)) - -#define _mm512_prefetch_i64scatter_pd(addr, index, scale, hint) \ - __builtin_ia32_scatterpfqpd((__mmask8)-1, (__v8di)(__m512i)(index), \ - (void *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_mask_prefetch_i64scatter_pd(addr, mask, index, scale, hint) \ - __builtin_ia32_scatterpfqpd((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (void *)(addr), (int)(scale), \ - (int)(hint)) - -#define _mm512_prefetch_i64scatter_ps(addr, index, scale, hint) \ - __builtin_ia32_scatterpfqps((__mmask8)-1, (__v8di)(__m512i)(index), \ - (void *)(addr), (int)(scale), 
(int)(hint)) - -#define _mm512_mask_prefetch_i64scatter_ps(addr, mask, index, scale, hint) \ - __builtin_ia32_scatterpfqps((__mmask8)(mask), (__v8di)(__m512i)(index), \ - (void *)(addr), (int)(scale), (int)(hint)) - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512vbmi2intrin.h b/include/avx512vbmi2intrin.h deleted file mode 100644 index 17fa777..0000000 --- a/include/avx512vbmi2intrin.h +++ /dev/null @@ -1,357 +0,0 @@ -/*===------------- avx512vbmi2intrin.h - VBMI2 intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VBMI2INTRIN_H -#define __AVX512VBMI2INTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi2"), __min_vector_width__(512))) - - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_compress_epi16(__m512i __S, __mmask32 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, - (__v32hi) __S, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_compress_epi16(__mmask32 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_compresshi512_mask ((__v32hi) __D, - (__v32hi) _mm512_setzero_si512(), - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_compress_epi8(__m512i __S, __mmask64 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, - (__v64qi) __S, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_compress_epi8(__mmask64 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_compressqi512_mask ((__v64qi) __D, - (__v64qi) _mm512_setzero_si512(), - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_mm512_mask_compressstoreu_epi16(void *__P, __mmask32 __U, __m512i __D) -{ - __builtin_ia32_compressstorehi512_mask ((__v32hi *) __P, (__v32hi) __D, - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_mm512_mask_compressstoreu_epi8(void *__P, __mmask64 __U, __m512i __D) -{ - __builtin_ia32_compressstoreqi512_mask ((__v64qi *) __P, (__v64qi) __D, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_expand_epi16(__m512i __S, __mmask32 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, - (__v32hi) __S, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_expand_epi16(__mmask32 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_expandhi512_mask ((__v32hi) __D, - (__v32hi) _mm512_setzero_si512(), - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_expand_epi8(__m512i __S, __mmask64 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, - (__v64qi) __S, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_expand_epi8(__mmask64 __U, __m512i __D) -{ - return (__m512i) __builtin_ia32_expandqi512_mask ((__v64qi) __D, - (__v64qi) _mm512_setzero_si512(), - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_expandloadu_epi16(__m512i __S, __mmask32 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, - (__v32hi) __S, - __U); -} 
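
For orientation, a minimal usage sketch of the AVX512-VBMI2 compress intrinsics above (illustrative only, not part of the patch; it assumes a clang build with -mavx512vbmi2 -mavx512bw on capable hardware):

/* Sketch: compress the even-numbered 16-bit lanes to the front of the
 * vector, zeroing the remainder. Assumes -mavx512vbmi2 -mavx512bw. */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
	__m512i v = _mm512_set1_epi16(7);
	__mmask32 even = 0x55555555;  /* select lanes 0, 2, 4, ... */
	__m512i packed = _mm512_maskz_compress_epi16(even, v);

	short out[32];
	_mm512_storeu_si512((void *)out, packed);
	printf("%d %d\n", out[0], out[16]); /* 7 in the packed lanes, 0 above */
	return 0;
}

The expand and expandloadu intrinsics in the same hunk perform the inverse operation, scattering consecutive source lanes into the positions selected by the mask.
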
- -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_expandloadu_epi16(__mmask32 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadhi512_mask ((const __v32hi *)__P, - (__v32hi) _mm512_setzero_si512(), - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_expandloadu_epi8(__m512i __S, __mmask64 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, - (__v64qi) __S, - __U); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_expandloadu_epi8(__mmask64 __U, void const *__P) -{ - return (__m512i) __builtin_ia32_expandloadqi512_mask ((const __v64qi *)__P, - (__v64qi) _mm512_setzero_si512(), - __U); -} - -#define _mm512_shldi_epi64(A, B, I) \ - ((__m512i)__builtin_ia32_vpshldq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shldi_epi64(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S))) - -#define _mm512_maskz_shldi_epi64(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shldi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_shldi_epi32(A, B, I) \ - ((__m512i)__builtin_ia32_vpshldd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shldi_epi32(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S))) - -#define _mm512_maskz_shldi_epi32(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shldi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_shldi_epi16(A, B, I) \ - ((__m512i)__builtin_ia32_vpshldw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shldi_epi16(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)(__m512i)(S))) - -#define _mm512_maskz_shldi_epi16(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shldi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512())) - -#define _mm512_shrdi_epi64(A, B, I) \ - ((__m512i)__builtin_ia32_vpshrdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shrdi_epi64(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)(__m512i)(S))) - -#define _mm512_maskz_shrdi_epi64(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectq_512((__mmask8)(U), \ - (__v8di)_mm512_shrdi_epi64((A), (B), (I)), \ - (__v8di)_mm512_setzero_si512())) - -#define _mm512_shrdi_epi32(A, B, I) \ - ((__m512i)__builtin_ia32_vpshrdd512((__v16si)(__m512i)(A), \ - (__v16si)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)(__m512i)(S))) - -#define _mm512_maskz_shrdi_epi32(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectd_512((__mmask16)(U), \ - (__v16si)_mm512_shrdi_epi32((A), (B), (I)), \ - (__v16si)_mm512_setzero_si512())) - -#define _mm512_shrdi_epi16(A, B, I) \ - ((__m512i)__builtin_ia32_vpshrdw512((__v32hi)(__m512i)(A), \ - (__v32hi)(__m512i)(B), (int)(I))) - -#define _mm512_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), 
(I)), \ - (__v32hi)(__m512i)(S))) - -#define _mm512_maskz_shrdi_epi16(U, A, B, I) \ - ((__m512i)__builtin_ia32_selectw_512((__mmask32)(U), \ - (__v32hi)_mm512_shrdi_epi16((A), (B), (I)), \ - (__v32hi)_mm512_setzero_si512())) - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi64(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshldvq512((__v8di)__A, (__v8di)__B, - (__v8di)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shldv_epi64(__A, __B, __C), - (__v8di)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shldv_epi64(__A, __B, __C), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi32(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshldvd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shldv_epi32(__A, __B, __C), - (__v16si)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shldv_epi32(__A, __B, __C), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shldv_epi16(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshldvw512((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shldv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shldv_epi16(__A, __B, __C), - (__v32hi)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shldv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shldv_epi16(__A, __B, __C), - (__v32hi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi64(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshrdvq512((__v8di)__A, (__v8di)__B, - (__v8di)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi64(__m512i __A, __mmask8 __U, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shrdv_epi64(__A, __B, __C), - (__v8di)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi64(__mmask8 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectq_512(__U, - (__v8di)_mm512_shrdv_epi64(__A, __B, __C), - (__v8di)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi32(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshrdvd512((__v16si)__A, (__v16si)__B, - (__v16si)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi32(__m512i __A, __mmask16 __U, __m512i __B, __m512i __C) -{ - return (__m512i) __builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shrdv_epi32(__A, __B, __C), - (__v16si)__A); -} - -static __inline__ __m512i 
__DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi32(__mmask16 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i) __builtin_ia32_selectd_512(__U, - (__v16si)_mm512_shrdv_epi32(__A, __B, __C), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_shrdv_epi16(__m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_vpshrdvw512((__v32hi)__A, (__v32hi)__B, - (__v32hi)__C); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_shrdv_epi16(__m512i __A, __mmask32 __U, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), - (__v32hi)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_shrdv_epi16(__mmask32 __U, __m512i __A, __m512i __B, __m512i __C) -{ - return (__m512i)__builtin_ia32_selectw_512(__U, - (__v32hi)_mm512_shrdv_epi16(__A, __B, __C), - (__v32hi)_mm512_setzero_si512()); -} - - -#undef __DEFAULT_FN_ATTRS - -#endif - diff --git a/include/avx512vbmiintrin.h b/include/avx512vbmiintrin.h deleted file mode 100644 index c0e0f94..0000000 --- a/include/avx512vbmiintrin.h +++ /dev/null @@ -1,105 +0,0 @@ -/*===------------- avx512vbmiintrin.h - VBMI intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __VBMIINTRIN_H -#define __VBMIINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi"), __min_vector_width__(512))) - - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutex2var_epi8(__m512i __A, __m512i __I, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpermi2varqi512((__v64qi)__A, (__v64qi)__I, - (__v64qi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutex2var_epi8(__m512i __A, __mmask64 __U, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask2_permutex2var_epi8(__m512i __A, __m512i __I, __mmask64 __U, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)__I); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutex2var_epi8(__mmask64 __U, __m512i __A, __m512i __I, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512(__U, - (__v64qi)_mm512_permutex2var_epi8(__A, __I, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_permutexvar_epi8 (__m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_permvarqi512((__v64qi) __B, (__v64qi) __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_permutexvar_epi8 (__mmask64 __M, __m512i __A, - __m512i __B) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_permutexvar_epi8(__A, __B), - (__v64qi)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_permutexvar_epi8 (__m512i __W, __mmask64 __M, __m512i __A, - __m512i __B) -{ - return 
(__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_permutexvar_epi8(__A, __B), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_multishift_epi64_epi8(__m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_vpmultishiftqb512((__v64qi)__X, (__v64qi) __Y); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_multishift_epi64_epi8(__m512i __W, __mmask64 __M, __m512i __X, - __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), - (__v64qi)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_multishift_epi64_epi8(__mmask64 __M, __m512i __X, __m512i __Y) -{ - return (__m512i)__builtin_ia32_selectb_512((__mmask64)__M, - (__v64qi)_mm512_multishift_epi64_epi8(__X, __Y), - (__v64qi)_mm512_setzero_si512()); -} - - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512vbmivlintrin.h b/include/avx512vbmivlintrin.h deleted file mode 100644 index c5b96ae..0000000 --- a/include/avx512vbmivlintrin.h +++ /dev/null @@ -1,188 +0,0 @@ -/*===------------- avx512vbmivlintrin.h - VBMI intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __VBMIVLINTRIN_H -#define __VBMIVLINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vbmi,avx512vl"), __min_vector_width__(256))) - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi8(__m128i __A, __m128i __I, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpermi2varqi128((__v16qi)__A, - (__v16qi)__I, - (__v16qi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutex2var_epi8(__m128i __A, __mmask16 __U, __m128i __I, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask2_permutex2var_epi8(__m128i __A, __m128i __I, __mmask16 __U, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)__I); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi8(__mmask16 __U, __m128i __A, __m128i __I, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128(__U, - (__v16qi)_mm_permutex2var_epi8(__A, __I, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi8(__m256i __A, __m256i __I, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpermi2varqi256((__v32qi)__A, (__v32qi)__I, - (__v32qi)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutex2var_epi8(__m256i __A, __mmask32 __U, __m256i __I, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask2_permutex2var_epi8(__m256i __A, __m256i __I, 
__mmask32 __U, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)__I); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi8(__mmask32 __U, __m256i __A, __m256i __I, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256(__U, - (__v32qi)_mm256_permutex2var_epi8(__A, __I, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi8 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_permvarqi128((__v16qi)__B, (__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi8 (__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_permutexvar_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi8 (__m128i __W, __mmask16 __M, __m128i __A, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_permutexvar_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi8 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_permvarqi256((__v32qi) __B, (__v32qi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi8 (__mmask32 __M, __m256i __A, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_permutexvar_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi8 (__m256i __W, __mmask32 __M, __m256i __A, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_permutexvar_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_multishift_epi64_epi8(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_vpmultishiftqb128((__v16qi)__X, (__v16qi)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_multishift_epi64_epi8(__m128i __W, __mmask16 __M, __m128i __X, - __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_multishift_epi64_epi8(__mmask16 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_multishift_epi64_epi8(__X, __Y), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_multishift_epi64_epi8(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_vpmultishiftqb256((__v32qi)__X, (__v32qi)__Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_multishift_epi64_epi8(__m256i __W, __mmask32 __M, __m256i __X, - __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_multishift_epi64_epi8(__mmask32 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_multishift_epi64_epi8(__X, __Y), - (__v32qi)_mm256_setzero_si256()); -} - - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlbf16intrin.h b/include/avx512vlbf16intrin.h deleted file mode 100644 index adc43c1..0000000 
--- a/include/avx512vlbf16intrin.h +++ /dev/null @@ -1,530 +0,0 @@ -/*===--------- avx512vlbf16intrin.h - AVX512_BF16 intrinsics ---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VLBF16INTRIN_H -#define __AVX512VLBF16INTRIN_H - -#if (__clang_major__ <= 15) -typedef short __m128bh __attribute__((__vector_size__(16), __aligned__(16))); -#endif - -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512vl, avx512bf16"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512vl, avx512bf16"), __min_vector_width__(256))) - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \param __B -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __B, and higher 64 bits come from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_cvtne2ps_pbh(__m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_cvtne2ps2bf16_128((__v4sf) __A, - (__v4sf) __B); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \param __B -/// A 128-bit vector of [4 x float]. -/// \param __W -/// A 128-bit vector of [8 x bfloat]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element from __W. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __B, and higher 64 bits come from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_mask_cvtne2ps_pbh(__m128bh __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtne2ps_pbh(__A, __B), - (__v8hi)__W); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \param __B -/// A 128-bit vector of [4 x float]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element is zero. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __B, and higher 64 bits come from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtne2ps_pbh(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128bh)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtne2ps_pbh(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. 
-/// \param __B -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from -/// conversion of __B, and higher 128 bits come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 -_mm256_cvtne2ps_pbh(__m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_cvtne2ps2bf16_256((__v8sf) __A, - (__v8sf) __B); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. -/// \param __B -/// A 256-bit vector of [8 x float]. -/// \param __W -/// A 256-bit vector of [16 x bfloat]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element from __W. -/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from -/// conversion of __B, and higher 128 bits come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtne2ps_pbh(__m256bh __W, __mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), - (__v16hi)__W); -} - -/// Convert Two Packed Single Data to One Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNE2PS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. -/// \param __B -/// A 256-bit vector of [8 x float]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A or __B. A 0 means element is zero. -/// \returns A 256-bit vector of [16 x bfloat] whose lower 128 bits come from -/// conversion of __B, and higher 128 bits come from conversion of __A. -static __inline__ __m256bh __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtne2ps_pbh(__mmask16 __U, __m256 __A, __m256 __B) { - return (__m256bh)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtne2ps_pbh(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __A, and higher 64 bits are 0. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_cvtneps_pbh(__m128 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8)-1); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \param __W -/// A 128-bit vector of [8 x bfloat]. -/// \param __U -/// A 4-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element from __W. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __A, and higher 64 bits are 0. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m128 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)__W, - (__mmask8)__U); -} - -/// Convert Packed Single Data to Packed BF16 Data. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 128-bit vector of [4 x float]. -/// \param __U -/// A 4-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element is zero. -/// \returns A 128-bit vector of [8 x bfloat] whose lower 64 bits come from -/// conversion of __A, and higher 64 bits are 0. -static __inline__ __m128bh __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtneps_pbh(__mmask8 __U, __m128 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_128_mask((__v4sf) __A, - (__v8hi)_mm_setzero_si128(), - (__mmask8)__U); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. -/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS256 -_mm256_cvtneps_pbh(__m256 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)_mm_undefined_si128(), - (__mmask8)-1); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. -/// \param __W -/// A 256-bit vector of [8 x bfloat]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element from __W. -/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtneps_pbh(__m128bh __W, __mmask8 __U, __m256 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)__W, - (__mmask8)__U); -} - -/// Convert Packed Single Data to Packed BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A 256-bit vector of [8 x float]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means conversion of __A. A 0 means element is zero. -/// \returns A 128-bit vector of [8 x bfloat] comes from conversion of __A. -static __inline__ __m128bh __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtneps_pbh(__mmask8 __U, __m256 __A) { - return (__m128bh)__builtin_ia32_cvtneps2bf16_256_mask((__v8sf)__A, - (__v8hi)_mm_setzero_si128(), - (__mmask8)__U); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \param __B -/// A 128-bit vector of [8 x bfloat]. -/// \param __D -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_dpbf16_ps(__m128 __D, __m128bh __A, __m128bh __B) { - return (__m128)__builtin_ia32_dpbf16ps_128((__v4sf)__D, - (__v4si)__A, - (__v4si)__B); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \param __B -/// A 128-bit vector of [8 x bfloat]. -/// \param __D -/// A 128-bit vector of [4 x float]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. 
-/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. -/// \returns A 128-bit vector of [4 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_dpbf16_ps(__m128 __D, __mmask8 __U, __m128bh __A, __m128bh __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_dpbf16_ps(__D, __A, __B), - (__v4sf)__D); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \param __B -/// A 128-bit vector of [8 x bfloat]. -/// \param __D -/// A 128-bit vector of [4 x float]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. -/// \returns A 128-bit vector of [4 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_dpbf16_ps(__mmask8 __U, __m128 __D, __m128bh __A, __m128bh __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_dpbf16_ps(__D, __A, __B), - (__v4sf)_mm_setzero_si128()); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 256-bit vector of [16 x bfloat]. -/// \param __B -/// A 256-bit vector of [16 x bfloat]. -/// \param __D -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_dpbf16_ps(__m256 __D, __m256bh __A, __m256bh __B) { - return (__m256)__builtin_ia32_dpbf16ps_256((__v8sf)__D, - (__v8si)__A, - (__v8si)__B); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 256-bit vector of [16 x bfloat]. -/// \param __B -/// A 256-bit vector of [16 x bfloat]. -/// \param __D -/// A 256-bit vector of [8 x float]. -/// \param __U -/// A 16-bit mask value specifying what is chosen for each element. -/// A 1 means __A and __B's dot product accumulated with __D. A 0 means __D. -/// \returns A 256-bit vector of [8 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_dpbf16_ps(__m256 __D, __mmask8 __U, __m256bh __A, __m256bh __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), - (__v8sf)__D); -} - -/// Dot Product of BF16 Pairs Accumulated into Packed Single Precision. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDPBF16PS instructions. -/// -/// \param __A -/// A 256-bit vector of [16 x bfloat]. -/// \param __B -/// A 256-bit vector of [16 x bfloat]. -/// \param __D -/// A 256-bit vector of [8 x float]. -/// \param __U -/// A 8-bit mask value specifying what is chosen for each element. -/// A 1 means __A and __B's dot product accumulated with __D. A 0 means 0. 
-/// \returns A 256-bit vector of [8 x float] comes from Dot Product of -/// __A, __B and __D -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpbf16_ps(__mmask8 __U, __m256 __D, __m256bh __A, __m256bh __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_dpbf16_ps(__D, __A, __B), - (__v8sf)_mm256_setzero_si256()); -} - -/// Convert One Single float Data to One BF16 Data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTNEPS2BF16 instructions. -/// -/// \param __A -/// A float data. -/// \returns A bf16 data whose sign field and exponent field keep unchanged, -/// and fraction field is truncated to 7 bits. -static __inline__ __bfloat16 __DEFAULT_FN_ATTRS128 _mm_cvtness_sbh(float __A) { - __v4sf __V = {__A, 0, 0, 0}; -#if (__clang_major__ > 15) - __v8bf __R = __builtin_ia32_cvtneps2bf16_128_mask( - (__v4sf)__V, (__v8bf)_mm_undefined_si128(), (__mmask8)-1); - return (__bf16)__R[0]; -#else - __v8hi __R = __builtin_ia32_cvtneps2bf16_128_mask( - (__v4sf)__V, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); - return __R[0]; -#endif -} - -/// Convert Packed BF16 Data to Packed float Data. -/// -/// \headerfile -/// -/// \param __A -/// A 128-bit vector of [4 x bfloat]. -/// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtpbh_ps(__m128bh __A) { - return _mm_castsi128_ps( - (__m128i)_mm_slli_epi32((__m128i)_mm_cvtepi16_epi32((__m128i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data. -/// -/// \headerfile -/// -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtpbh_ps(__m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using zeroing mask. -/// -/// \headerfile -/// -/// \param __U -/// A 4-bit mask. Elements are zeroed out when the corresponding mask -/// bit is not set. -/// \param __A -/// A 128-bit vector of [4 x bfloat]. -/// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_slli_epi32( - (__m128i)_mm_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using zeroing mask. -/// -/// \headerfile -/// -/// \param __U -/// A 8-bit mask. Elements are zeroed out when the corresponding mask -/// bit is not set. -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpbh_ps(__mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_slli_epi32( - (__m256i)_mm256_maskz_cvtepi16_epi32((__mmask8)__U, (__m128i)__A), 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using merging mask. -/// -/// \headerfile -/// -/// \param __S -/// A 128-bit vector of [4 x float]. Elements are copied from __S when -/// the corresponding mask bit is not set. -/// \param __U -/// A 4-bit mask. Elements are zeroed out when the corresponding mask -/// bit is not set. -/// \param __A -/// A 128-bit vector of [4 x bfloat]. 
-/// \returns A 128-bit vector of [4 x float] come from conversion of __A -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpbh_ps(__m128 __S, __mmask8 __U, __m128bh __A) { - return _mm_castsi128_ps((__m128i)_mm_mask_slli_epi32( - (__m128i)__S, (__mmask8)__U, (__m128i)_mm_cvtepi16_epi32((__m128i)__A), - 16)); -} - -/// Convert Packed BF16 Data to Packed float Data using merging mask. -/// -/// \headerfile -/// -/// \param __S -/// A 256-bit vector of [8 x float]. Elements are copied from __S when -/// the corresponding mask bit is not set. -/// \param __U -/// A 8-bit mask. Elements are zeroed out when the corresponding mask -/// bit is not set. -/// \param __A -/// A 128-bit vector of [8 x bfloat]. -/// \returns A 256-bit vector of [8 x float] come from conversion of __A -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpbh_ps(__m256 __S, __mmask8 __U, __m128bh __A) { - return _mm256_castsi256_ps((__m256i)_mm256_mask_slli_epi32( - (__m256i)__S, (__mmask8)__U, (__m256i)_mm256_cvtepi16_epi32((__m128i)__A), - 16)); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlbitalgintrin.h b/include/avx512vlbitalgintrin.h deleted file mode 100644 index 5154eae..0000000 --- a/include/avx512vlbitalgintrin.h +++ /dev/null @@ -1,145 +0,0 @@ -/*===---- avx512vlbitalgintrin.h - BITALG intrinsics -----------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VLBITALGINTRIN_H -#define __AVX512VLBITALGINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bitalg"), __min_vector_width__(256))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi16(__m256i __A) -{ - return (__m256i) __builtin_ia32_vpopcntw_256((__v16hi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi16(__m256i __A, __mmask16 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectw_256((__mmask16) __U, - (__v16hi) _mm256_popcnt_epi16(__B), - (__v16hi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi16(__mmask16 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi16((__m256i) _mm256_setzero_si256(), - __U, - __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi16(__m128i __A) -{ - return (__m128i) __builtin_ia32_vpopcntw_128((__v8hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi16(__m128i __A, __mmask8 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectw_128((__mmask8) __U, - (__v8hi) _mm_popcnt_epi16(__B), - (__v8hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi16(__mmask8 __U, __m128i __B) -{ - return _mm_mask_popcnt_epi16((__m128i) _mm_setzero_si128(), - __U, - __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi8(__m256i __A) -{ - return (__m256i) __builtin_ia32_vpopcntb_256((__v32qi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi8(__m256i __A, __mmask32 __U, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256((__mmask32) __U, - (__v32qi) _mm256_popcnt_epi8(__B), - (__v32qi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi8(__mmask32 __U, __m256i __B) -{ - return _mm256_mask_popcnt_epi8((__m256i) _mm256_setzero_si256(), - __U, - __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi8(__m128i __A) -{ - return (__m128i) __builtin_ia32_vpopcntb_128((__v16qi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi8(__m128i __A, __mmask16 __U, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128((__mmask16) __U, - (__v16qi) _mm_popcnt_epi8(__B), - (__v16qi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi8(__mmask16 __U, __m128i __B) -{ - return _mm_mask_popcnt_epi8((__m128i) _mm_setzero_si128(), - __U, - __B); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_mask_bitshuffle_epi64_mask(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__mmask32) __builtin_ia32_vpshufbitqmb256_mask((__v32qi) __A, - (__v32qi) __B, - __U); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_bitshuffle_epi64_mask(__m256i __A, __m256i __B) -{ - return _mm256_mask_bitshuffle_epi64_mask((__mmask32) -1, - __A, - __B); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_mask_bitshuffle_epi64_mask(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__mmask16) __builtin_ia32_vpshufbitqmb128_mask((__v16qi) __A, - (__v16qi) __B, - __U); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_bitshuffle_epi64_mask(__m128i __A, __m128i __B) -{ - return _mm_mask_bitshuffle_epi64_mask((__mmask16) -1, - __A, - __B); -} - - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlbwintrin.h 
b/include/avx512vlbwintrin.h deleted file mode 100644 index 7873516..0000000 --- a/include/avx512vlbwintrin.h +++ /dev/null @@ -1,2809 +0,0 @@ -/*===---- avx512vlbwintrin.h - AVX512VL and AVX512BW intrinsics ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VLBWINTRIN_H -#define __AVX512VLBWINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512bw"), __min_vector_width__(256))) - -/* Integer compare */ - -#define _mm_cmp_epi8_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1)) - -#define _mm_mask_cmp_epi8_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_cmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm_cmp_epu8_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)-1)) - -#define _mm_mask_cmp_epu8_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpb128_mask((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm256_cmp_epi8_mask(a, b, p) \ - ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1)) - -#define _mm256_mask_cmp_epi8_mask(m, a, b, p) \ - ((__mmask32)__builtin_ia32_cmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m))) - -#define _mm256_cmp_epu8_mask(a, b, p) \ - ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)-1)) - -#define _mm256_mask_cmp_epu8_mask(m, a, b, p) \ - ((__mmask32)__builtin_ia32_ucmpb256_mask((__v32qi)(__m256i)(a), \ - (__v32qi)(__m256i)(b), (int)(p), \ - (__mmask32)(m))) - -#define _mm_cmp_epi16_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epi16_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_epu16_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epu16_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpw128_mask((__v8hi)(__m128i)(a), \ - (__v8hi)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_epi16_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1)) - -#define _mm256_mask_cmp_epi16_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_cmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm256_cmp_epu16_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)-1)) - 
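/* Editorial usage sketch, not part of this patch or the deleted header: the
 * cmp/ucmp macros above each return a k-mask with one bit per element.
 * Assuming a clang build with -mavx512vl -mavx512bw, counting the unsigned
 * 16-bit lanes of `a` that are <= the corresponding lanes of `b` might look
 * like this (helper name is hypothetical): */
#include <immintrin.h>

static inline int count_le_epu16_sketch(__m256i a, __m256i b)
{
    /* one mask bit per 16-bit lane: set when a[i] <= b[i] (unsigned) */
    __mmask16 k = _mm256_cmp_epu16_mask(a, b, _MM_CMPINT_LE);
    return __builtin_popcount((unsigned)k);
}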
-#define _mm256_mask_cmp_epu16_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_ucmpw256_mask((__v16hi)(__m256i)(a), \ - (__v16hi)(__m256i)(b), (int)(p), \ - (__mmask16)(m))) - -#define _mm_cmpeq_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epi8_mask(A, B) \ - _mm_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epi8_mask(k, A, B) \ - _mm_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epi8_mask(A, B) \ - _mm256_cmp_epi8_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epi8_mask(k, A, B) \ - _mm256_mask_cmp_epi8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epu8_mask(A, B) \ - _mm_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epu8_mask(k, A, B) \ - _mm_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define 
_mm256_cmpeq_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epu8_mask(A, B) \ - _mm256_cmp_epu8_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epu8_mask(k, A, B) \ - _mm256_mask_cmp_epu8_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epi16_mask(A, B) \ - _mm_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epi16_mask(k, A, B) \ - _mm_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epi16_mask(A, B) \ - _mm256_cmp_epi16_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epi16_mask(k, A, B) \ - _mm256_mask_cmp_epi16_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) 
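/* Editorial sketch, not part of this patch (assumes clang with -mavx512vl
 * -mavx512bw; helper name is hypothetical): the *_mask compare wrappers above
 * compose with the masked selects defined later in this header.  Clamping
 * signed 16-bit lanes of `v` down to `limit`, purely to illustrate the
 * mask-then-blend pattern -- _mm_min_epi16 would of course do the same: */
#include <immintrin.h>

static inline __m128i clamp_epi16_sketch(__m128i v, __m128i limit)
{
    __mmask8 over = _mm_cmpgt_epi16_mask(v, limit);   /* lanes where v > limit */
    return _mm_mask_blend_epi16(over, v, limit);      /* take limit in those lanes */
}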
-#define _mm_mask_cmpeq_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epu16_mask(A, B) \ - _mm_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epu16_mask(k, A, B) \ - _mm_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epu16_mask(A, B) \ - _mm256_cmp_epu16_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epu16_mask(k, A, B) \ - _mm256_mask_cmp_epu16_mask((k), (A), (B), _MM_CMPINT_NE) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_add_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B){ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_add_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_add_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_add_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_add_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_add_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sub_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_sub_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_maskz_sub_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_sub_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sub_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sub_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sub_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sub_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_add_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_add_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_add_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_add_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_add_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_add_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_add_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_add_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sub_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_sub_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_sub_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sub_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sub_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sub_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mullo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mullo_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mullo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mullo_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mullo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mullo_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mullo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mullo_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi8 (__mmask16 __U, __m128i __A, __m128i __W) -{ - return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __W, - (__v16qi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi8 (__mmask32 __U, __m256i __A, __m256i __W) -{ - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __W, - (__v32qi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi16 (__mmask8 __U, __m128i __A, __m128i __W) -{ - return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __W, - (__v8hi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi16 (__mmask16 __U, __m256i __A, __m256i __W) -{ - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __W, - (__v16hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi8(__m128i __W, __mmask16 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_abs_epi8(__A), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi8(__mmask16 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_abs_epi8(__A), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi8(__m256i __W, __mmask32 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_abs_epi8(__A), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi8 (__mmask32 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_abs_epi8(__A), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi16(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_abs_epi16(__A), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi16(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_abs_epi16(__A), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi16(__m256i __W, __mmask16 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_abs_epi16(__A), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi16(__mmask16 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_abs_epi16(__A), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_packs_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packs_epi32(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_packs_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packs_epi32(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_packs_epi32(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - 
(__v16hi)_mm256_packs_epi32(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_packs_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packs_epi32(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_packs_epi16(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packs_epi16(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_packs_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packs_epi16(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_packs_epi16(__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packs_epi16(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_packs_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packs_epi16(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_packus_epi32(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packus_epi32(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_packus_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_packus_epi32(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_packus_epi32(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packus_epi32(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_packus_epi32(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_packus_epi32(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_packus_epi16(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packus_epi16(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_packus_epi16(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_packus_epi16(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_packus_epi16(__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packus_epi16(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_packus_epi16(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_packus_epi16(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_adds_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return 
(__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_adds_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_adds_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_adds_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_adds_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_adds_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_adds_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_adds_epi16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_adds_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epu8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_adds_epu8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_adds_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_adds_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epu8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_adds_epu8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_adds_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_adds_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_adds_epu16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_adds_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_adds_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, 
- (__v16hi)_mm256_adds_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_adds_epu16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_adds_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_avg_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_avg_epu8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_avg_epu8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_avg_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_avg_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_avg_epu8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_avg_epu8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_avg_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_avg_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_avg_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_avg_epu16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_avg_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_avg_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_avg_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_avg_epu16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_avg_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epi8(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epi8(__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epi16(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - 
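/* Editorial sketch, not part of this patch (assumes clang with -mavx512vl
 * -mavx512bw; helper name is hypothetical): every mask_/maskz_ wrapper above
 * follows the same pattern -- compute the unmasked result, then select it only
 * in lanes whose mask bit is set.  For example, a running unsigned-byte
 * maximum that updates only the lanes flagged as valid: */
#include <immintrin.h>

static inline __m256i running_max_epu8_sketch(__m256i acc, __m256i next,
                                              __mmask32 valid)
{
    /* valid lanes take max(acc, next); the rest keep acc unchanged */
    return _mm256_mask_max_epu8(acc, valid, acc, next);
}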
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epi16(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epu8(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_max_epu8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epu8 (__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_max_epu8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epu16(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_max_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epu16(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_max_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epi8(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epi8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epi8(__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_mask_min_epi8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epi16(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epi16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epi16(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epi16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epu8(__mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epu8(__m128i __W, __mmask16 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm_min_epu8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epu8 (__mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epu8(__m256i __W, __mmask32 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__M, - (__v32qi)_mm256_min_epu8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epu16(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epu16(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_min_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epu16(__mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epu16(__m256i __W, __mmask16 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_min_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shuffle_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_shuffle_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shuffle_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ 
- return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_shuffle_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shuffle_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_shuffle_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shuffle_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_shuffle_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_subs_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_subs_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_subs_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_subs_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_subs_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_subs_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_subs_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_subs_epi16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_subs_epu8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epu8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_subs_epu8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_subs_epu8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_subs_epu8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epu8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_subs_epu8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return 
(__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_subs_epu8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_subs_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_subs_epu16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_subs_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_subs_epu16(__m256i __W, __mmask16 __U, __m256i __A, - __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_subs_epu16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_subs_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutex2var_epi16(__m128i __A, __m128i __I, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, - (__v8hi) __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutex2var_epi16(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask2_permutex2var_epi16(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)__I); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutex2var_epi16 (__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_permutex2var_epi16(__A, __I, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_epi16(__m256i __A, __m256i __I, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, - (__v16hi)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutex2var_epi16(__m256i __A, __mmask16 __U, __m256i __I, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask2_permutex2var_epi16(__m256i __A, __m256i __I, __mmask16 __U, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)__I); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutex2var_epi16 (__mmask16 __U, __m256i __A, __m256i __I, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_permutex2var_epi16(__A, __I, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_maddubs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_maddubs_epi16(__X, __Y), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_maddubs_epi16(__mmask8 __U, __m128i __X, __m128i __Y) { - return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_maddubs_epi16(__X, __Y), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_maddubs_epi16(__m256i __W, __mmask16 __U, __m256i __X, - __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_maddubs_epi16(__X, __Y), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_maddubs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_maddubs_epi16(__X, __Y), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_madd_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_madd_epi16(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_madd_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_madd_epi16(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_madd_epi16(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_madd_epi16(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_madd_epi16(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_madd_epi16(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi16_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi16_epi8 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovswb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi16_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, - (__v16qi) _mm_setzero_si128(), - (__mmask16) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi16_epi8 (__mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovswb256_mask ((__v16hi) __A, - (__v16qi) _mm_setzero_si128(), - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi16_epi8 (__m128i __A) { - return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi16_epi8 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovuswb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - 
__M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi16_epi8 (__m256i __A) { - return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, - (__v16qi) _mm_setzero_si128(), - (__mmask16) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi16_epi8 (__mmask16 __M, __m256i __A) { - return (__m128i) __builtin_ia32_pmovuswb256_mask ((__v16hi) __A, - (__v16qi) _mm_setzero_si128(), - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi16_epi8 (__m128i __A) { - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v8hi)__A, __v8qi), - (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi16_epi8 (__m128i __O, __mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi16_epi8 (__mmask8 __M, __m128i __A) { - return (__m128i) __builtin_ia32_pmovwb128_mask ((__v8hi) __A, - (__v16qi) _mm_setzero_si128(), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovwb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); -} - - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovuswb128mem_mask ((__v16qi *) __P, (__v8hi) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_epi8 (__m256i __A) { - return (__m128i)__builtin_convertvector((__v16hi) __A, __v16qi); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi16_epi8 (__m128i __O, __mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm256_cvtepi16_epi8(__A), - (__v16qi)__O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi16_epi8 (__mmask16 __M, __m256i __A) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__M, - (__v16qi)_mm256_cvtepi16_epi8(__A), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) -{ - __builtin_ia32_pmovwb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) -{ - __builtin_ia32_pmovswb256mem_mask ((__v16qi *) __P, (__v16hi) __A, __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi16_storeu_epi8 (void * __P, __mmask16 __M, __m256i __A) -{ - __builtin_ia32_pmovuswb256mem_mask ((__v16qi*) __P, (__v16hi) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mulhrs_epi16(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhrs_epi16(__X, __Y), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mulhrs_epi16(__mmask8 
__U, __m128i __X, __m128i __Y) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhrs_epi16(__X, __Y), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mulhrs_epi16(__m256i __W, __mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhrs_epi16(__X, __Y), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mulhrs_epi16(__mmask16 __U, __m256i __X, __m256i __Y) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhrs_epi16(__X, __Y), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mulhi_epu16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epu16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mulhi_epu16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epu16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mulhi_epu16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mulhi_epu16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epu16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mulhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mulhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_mulhi_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mulhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mulhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_mulhi_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpackhi_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpackhi_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpackhi_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi8(__mmask32 
__U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpackhi_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpackhi_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpackhi_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpackhi_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpackhi_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi8(__m128i __W, __mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpacklo_epi8(__A, __B), - (__v16qi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi8(__mmask16 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectb_128((__mmask16)__U, - (__v16qi)_mm_unpacklo_epi8(__A, __B), - (__v16qi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi8(__m256i __W, __mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpacklo_epi8(__A, __B), - (__v32qi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi8(__mmask32 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectb_256((__mmask32)__U, - (__v32qi)_mm256_unpacklo_epi8(__A, __B), - (__v32qi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpacklo_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi16(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_unpacklo_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpacklo_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi16(__mmask16 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_unpacklo_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi8_epi16(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtepi8_epi16(__A), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
-_mm_maskz_cvtepi8_epi16(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtepi8_epi16(__A), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi8_epi16(__m256i __W, __mmask16 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtepi8_epi16(__A), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi8_epi16(__mmask16 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtepi8_epi16(__A), - (__v16hi)_mm256_setzero_si256()); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu8_epi16(__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtepu8_epi16(__A), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu8_epi16(__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_cvtepu8_epi16(__A), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu8_epi16(__m256i __W, __mmask16 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtepu8_epi16(__A), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu8_epi16 (__mmask16 __U, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_cvtepu8_epi16(__A), - (__v16hi)_mm256_setzero_si256()); -} - - -#define _mm_mask_shufflehi_epi16(W, U, A, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W))) - -#define _mm_maskz_shufflehi_epi16(U, A, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflehi_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128())) - -#define _mm256_mask_shufflehi_epi16(W, U, A, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)(__m256i)(W))) - -#define _mm256_maskz_shufflehi_epi16(U, A, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflehi_epi16((A), (imm)), \ - (__v16hi)_mm256_setzero_si256())) - -#define _mm_mask_shufflelo_epi16(W, U, A, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)(__m128i)(W))) - -#define _mm_maskz_shufflelo_epi16(U, A, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shufflelo_epi16((A), (imm)), \ - (__v8hi)_mm_setzero_si128())) - -#define _mm256_mask_shufflelo_epi16(W, U, A, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)(__m256i)(W))) - -#define _mm256_maskz_shufflelo_epi16(U, A, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shufflelo_epi16((A), \ - (imm)), \ - (__v16hi)_mm256_setzero_si256())) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sllv_epi16(__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_psllv16hi((__v16hi)__A, (__v16hi)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sllv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sllv_epi16(__A, __B), - (__v16hi)__W); -} 
- -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sllv_epi16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sllv_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sllv_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_psllv8hi((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sllv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sllv_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sllv_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sllv_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sll_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi16 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sll_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sll_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi16(__mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sll_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_slli_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_slli_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_slli_epi16 (__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_slli_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_slli_epi16(__m256i __W, __mmask16 __U, __m256i __A, - unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_slli_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi16(__mmask16 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_slli_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srlv_epi16(__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_psrlv16hi((__v16hi)__A, (__v16hi)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srlv_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srlv_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srlv_epi16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - 
(__v16hi)_mm256_srlv_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srlv_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_psrlv8hi((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srlv_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srlv_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srlv_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srlv_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srav_epi16(__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_psrav16hi((__v16hi)__A, (__v16hi)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srav_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srav_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srav_epi16(__mmask16 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srav_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srav_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_psrav8hi((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srav_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srav_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srav_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srav_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sra_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi16(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_sra_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sra_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi16(__mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_sra_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi16(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srai_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi16(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srai_epi16(__A, __B), - 
(__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srai_epi16(__m256i __W, __mmask16 __U, __m256i __A, - unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srai_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi16(__mmask16 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srai_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi16(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srl_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi16 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srl_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi16(__m256i __W, __mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srl_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi16(__mmask16 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srl_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srli_epi16(__m128i __W, __mmask8 __U, __m128i __A, int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srli_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srli_epi16 (__mmask8 __U, __m128i __A, int __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__U, - (__v8hi)_mm_srli_epi16(__A, __B), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi16(__m256i __W, __mmask16 __U, __m256i __A, int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srli_epi16(__A, __B), - (__v16hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi16(__mmask16 __U, __m256i __A, int __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__U, - (__v16hi)_mm256_srli_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi16 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __A, - (__v8hi) __W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi16 (__mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectw_128 ((__mmask8) __U, - (__v8hi) __A, - (__v8hi) _mm_setzero_si128 ()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi16 (__m256i __W, __mmask16 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __A, - (__v16hi) __W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi16 (__mmask16 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectw_256 ((__mmask16) __U, - (__v16hi) __A, - (__v16hi) _mm256_setzero_si256 ()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi8 (__m128i __W, __mmask16 __U, __m128i __A) -{ - return 
(__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __A, - (__v16qi) __W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi8 (__mmask16 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectb_128 ((__mmask16) __U, - (__v16qi) __A, - (__v16qi) _mm_setzero_si128 ()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi8 (__m256i __W, __mmask32 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __A, - (__v32qi) __W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi8 (__mmask32 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectb_256 ((__mmask32) __U, - (__v32qi) __A, - (__v32qi) _mm256_setzero_si256 ()); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi8 (__m128i __O, __mmask16 __M, char __A) -{ - return (__m128i) __builtin_ia32_selectb_128(__M, - (__v16qi) _mm_set1_epi8(__A), - (__v16qi) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi8 (__mmask16 __M, char __A) -{ - return (__m128i) __builtin_ia32_selectb_128(__M, - (__v16qi) _mm_set1_epi8(__A), - (__v16qi) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi8 (__m256i __O, __mmask32 __M, char __A) -{ - return (__m256i) __builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_set1_epi8(__A), - (__v32qi) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi8 (__mmask32 __M, char __A) -{ - return (__m256i) __builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_set1_epi8(__A), - (__v32qi) _mm256_setzero_si256()); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_loadu_epi16 (void const *__P) -{ - struct __loadu_epi16 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi16*)__P)->__v; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_epi16 (__m128i __W, __mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P, - (__v8hi) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_epi16 (__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddquhi128_mask ((const __v8hi *) __P, - (__v8hi) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_loadu_epi16 (void const *__P) -{ - struct __loadu_epi16 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi16*)__P)->__v; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_epi16 (__m256i __W, __mmask16 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P, - (__v16hi) __W, - (__mmask16) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_epi16 (__mmask16 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddquhi256_mask ((const __v16hi *) __P, - (__v16hi) - _mm256_setzero_si256 (), - (__mmask16) __U); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_loadu_epi8 (void const *__P) -{ - struct __loadu_epi8 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi8*)__P)->__v; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_epi8 (__m128i __W, __mmask16 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P, - (__v16qi) __W, - (__mmask16) __U); -} - 
-static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_epi8 (__mmask16 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddquqi128_mask ((const __v16qi *) __P, - (__v16qi) - _mm_setzero_si128 (), - (__mmask16) __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_loadu_epi8 (void const *__P) -{ - struct __loadu_epi8 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi8*)__P)->__v; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_epi8 (__m256i __W, __mmask32 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P, - (__v32qi) __W, - (__mmask32) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_epi8 (__mmask32 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddquqi256_mask ((const __v32qi *) __P, - (__v32qi) - _mm256_setzero_si256 (), - (__mmask32) __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_storeu_epi16 (void *__P, __m128i __A) -{ - struct __storeu_epi16 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi16*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_epi16 (void *__P, __mmask8 __U, __m128i __A) -{ - __builtin_ia32_storedquhi128_mask ((__v8hi *) __P, - (__v8hi) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_storeu_epi16 (void *__P, __m256i __A) -{ - struct __storeu_epi16 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi16*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_epi16 (void *__P, __mmask16 __U, __m256i __A) -{ - __builtin_ia32_storedquhi256_mask ((__v16hi *) __P, - (__v16hi) __A, - (__mmask16) __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_storeu_epi8 (void *__P, __m128i __A) -{ - struct __storeu_epi8 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi8*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_epi8 (void *__P, __mmask16 __U, __m128i __A) -{ - __builtin_ia32_storedquqi128_mask ((__v16qi *) __P, - (__v16qi) __A, - (__mmask16) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_storeu_epi8 (void *__P, __m256i __A) -{ - struct __storeu_epi8 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi8*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_epi8 (void *__P, __mmask32 __U, __m256i __A) -{ - __builtin_ia32_storedquqi256_mask ((__v32qi *) __P, - (__v32qi) __A, - (__mmask32) __U); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_test_epi8_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpneq_epi8_mask (_mm_and_si128(__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpneq_epi8_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_test_epi8_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpneq_epi8_mask (_mm256_and_si256(__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_mask_test_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpneq_epi8_mask (__U, _mm256_and_si256(__A, __B), - _mm256_setzero_si256()); -} - -static 
__inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_test_epi16_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpneq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpneq_epi16_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_test_epi16_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpneq_epi16_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256 ()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_mask_test_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpneq_epi16_mask (__U, _mm256_and_si256(__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_testn_epi8_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpeq_epi8_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_mask_testn_epi8_mask (__mmask16 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpeq_epi8_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_testn_epi8_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpeq_epi8_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_mask_testn_epi8_mask (__mmask32 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpeq_epi8_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_testn_epi16_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpeq_epi16_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_testn_epi16_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpeq_epi16_mask (__U, _mm_and_si128(__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_testn_epi16_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpeq_epi16_mask (_mm256_and_si256(__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_mask_testn_epi16_mask (__mmask16 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpeq_epi16_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS128 -_mm_movepi8_mask (__m128i __A) -{ - return (__mmask16) __builtin_ia32_cvtb2mask128 ((__v16qi) __A); -} - -static __inline__ __mmask32 __DEFAULT_FN_ATTRS256 -_mm256_movepi8_mask (__m256i __A) -{ - return (__mmask32) __builtin_ia32_cvtb2mask256 ((__v32qi) __A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi16_mask (__m128i __A) -{ - return (__mmask8) __builtin_ia32_cvtw2mask128 ((__v8hi) __A); -} - -static __inline__ __mmask16 __DEFAULT_FN_ATTRS256 -_mm256_movepi16_mask (__m256i __A) -{ - return (__mmask16) __builtin_ia32_cvtw2mask256 ((__v16hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_movm_epi8 (__mmask16 __A) -{ - return (__m128i) __builtin_ia32_cvtmask2b128 (__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_movm_epi8 (__mmask32 __A) -{ - return (__m256i) __builtin_ia32_cvtmask2b256 (__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_movm_epi16 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_cvtmask2w128 (__A); -} - 
-static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_movm_epi16 (__mmask16 __A) -{ - return (__m256i) __builtin_ia32_cvtmask2w256 (__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastb_epi8 (__m128i __O, __mmask16 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectb_128(__M, - (__v16qi) _mm_broadcastb_epi8(__A), - (__v16qi) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastb_epi8 (__mmask16 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectb_128(__M, - (__v16qi) _mm_broadcastb_epi8(__A), - (__v16qi) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastb_epi8 (__m256i __O, __mmask32 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_broadcastb_epi8(__A), - (__v32qi) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastb_epi8 (__mmask32 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectb_256(__M, - (__v32qi) _mm256_broadcastb_epi8(__A), - (__v32qi) _mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastw_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128(__M, - (__v8hi) _mm_broadcastw_epi16(__A), - (__v8hi) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastw_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectw_128(__M, - (__v8hi) _mm_broadcastw_epi16(__A), - (__v8hi) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastw_epi16 (__m256i __O, __mmask16 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256(__M, - (__v16hi) _mm256_broadcastw_epi16(__A), - (__v16hi) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastw_epi16 (__mmask16 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectw_256(__M, - (__v16hi) _mm256_broadcastw_epi16(__A), - (__v16hi) _mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi16 (__m256i __O, __mmask16 __M, short __A) -{ - return (__m256i) __builtin_ia32_selectw_256 (__M, - (__v16hi) _mm256_set1_epi16(__A), - (__v16hi) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi16 (__mmask16 __M, short __A) -{ - return (__m256i) __builtin_ia32_selectw_256(__M, - (__v16hi)_mm256_set1_epi16(__A), - (__v16hi) _mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi16 (__m128i __O, __mmask8 __M, short __A) -{ - return (__m128i) __builtin_ia32_selectw_128(__M, - (__v8hi) _mm_set1_epi16(__A), - (__v8hi) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi16 (__mmask8 __M, short __A) -{ - return (__m128i) __builtin_ia32_selectw_128(__M, - (__v8hi) _mm_set1_epi16(__A), - (__v8hi) _mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_permutexvar_epi16 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_permvarhi128((__v8hi) __B, (__v8hi) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_permutexvar_epi16 (__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_permutexvar_epi16(__A, __B), - (__v8hi) _mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_permutexvar_epi16 (__m128i __W, __mmask8 __M, __m128i __A, - __m128i __B) -{ - return 
(__m128i)__builtin_ia32_selectw_128((__mmask8)__M, - (__v8hi)_mm_permutexvar_epi16(__A, __B), - (__v8hi)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi16 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_permvarhi256((__v16hi) __B, (__v16hi) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi16 (__mmask16 __M, __m256i __A, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_permutexvar_epi16(__A, __B), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi16 (__m256i __W, __mmask16 __M, __m256i __A, - __m256i __B) -{ - return (__m256i)__builtin_ia32_selectw_256((__mmask16)__M, - (__v16hi)_mm256_permutexvar_epi16(__A, __B), - (__v16hi)__W); -} - -#define _mm_mask_alignr_epi8(W, U, A, B, N) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)(__m128i)(W))) - -#define _mm_maskz_alignr_epi8(U, A, B, N) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_alignr_epi8((A), (B), (int)(N)), \ - (__v16qi)_mm_setzero_si128())) - -#define _mm256_mask_alignr_epi8(W, U, A, B, N) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)(__m256i)(W))) - -#define _mm256_maskz_alignr_epi8(U, A, B, N) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_alignr_epi8((A), (B), (int)(N)), \ - (__v32qi)_mm256_setzero_si256())) - -#define _mm_dbsad_epu8(A, B, imm) \ - ((__m128i)__builtin_ia32_dbpsadbw128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(imm))) - -#define _mm_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)(__m128i)(W))) - -#define _mm_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_dbsad_epu8((A), (B), (imm)), \ - (__v8hi)_mm_setzero_si128())) - -#define _mm256_dbsad_epu8(A, B, imm) \ - ((__m256i)__builtin_ia32_dbpsadbw256((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), (int)(imm))) - -#define _mm256_mask_dbsad_epu8(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)(__m256i)(W))) - -#define _mm256_maskz_dbsad_epu8(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_dbsad_epu8((A), (B), (imm)), \ - (__v16hi)_mm256_setzero_si256())) - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __AVX512VLBWINTRIN_H */ diff --git a/include/avx512vlcdintrin.h b/include/avx512vlcdintrin.h deleted file mode 100644 index cc8b725..0000000 --- a/include/avx512vlcdintrin.h +++ /dev/null @@ -1,225 +0,0 @@ -/*===---- avx512vlcdintrin.h - AVX512VL and AVX512CD intrinsics ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use <avx512vlcdintrin.h> directly; include <immintrin.h> instead." -#endif - -#ifndef __AVX512VLCDINTRIN_H -#define __AVX512VLCDINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512cd"), __min_vector_width__(256))) - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m128i) _mm_set1_epi64x((long long) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastmb_epi64 (__mmask8 __A) -{ - return (__m256i) _mm256_set1_epi64x((long long)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcastmw_epi32 (__mmask16 __A) -{ - return (__m128i) _mm_set1_epi32((int)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcastmw_epi32 (__mmask16 __A) -{ - return (__m256i) _mm256_set1_epi32((int)__A); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_conflict_epi64 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vpconflictdi_128 ((__v2di) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_conflict_epi64 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_conflict_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_conflict_epi64 (__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_conflict_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_conflict_epi64 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vpconflictdi_256 ((__v4di) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_conflict_epi64 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_conflict_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_conflict_epi64 (__mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_conflict_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_conflict_epi32 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vpconflictsi_128 ((__v4si) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_conflict_epi32 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_conflict_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_conflict_epi32 (__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_conflict_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_conflict_epi32 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vpconflictsi_256 ((__v8si) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_conflict_epi32 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_conflict_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_conflict_epi32 (__mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_conflict_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_lzcnt_epi32 (__m128i __A) -{ - return (__m128i) 
__builtin_ia32_vplzcntd_128 ((__v4si) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_lzcnt_epi32 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_lzcnt_epi32 (__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_lzcnt_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_lzcnt_epi32 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vplzcntd_256 ((__v8si) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_lzcnt_epi32 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_lzcnt_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_lzcnt_epi32 (__mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_lzcnt_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_lzcnt_epi64 (__m128i __A) -{ - return (__m128i) __builtin_ia32_vplzcntq_128 ((__v2di) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_lzcnt_epi64 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_lzcnt_epi64 (__mmask8 __U, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_lzcnt_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_lzcnt_epi64 (__m256i __A) -{ - return (__m256i) __builtin_ia32_vplzcntq_256 ((__v4di) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_lzcnt_epi64 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_lzcnt_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_lzcnt_epi64 (__mmask8 __U, __m256i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_lzcnt_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __AVX512VLCDINTRIN_H */ diff --git a/include/avx512vldqintrin.h b/include/avx512vldqintrin.h deleted file mode 100644 index 713e1a1..0000000 --- a/include/avx512vldqintrin.h +++ /dev/null @@ -1,1167 +0,0 @@ -/*===---- avx512vldqintrin.h - AVX512VL and AVX512DQ intrinsics ------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use <avx512vldqintrin.h> directly; include <immintrin.h> instead." -#endif - -#ifndef __AVX512VLDQINTRIN_H -#define __AVX512VLDQINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512dq"), __min_vector_width__(256))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mullo_epi64 (__m256i __A, __m256i __B) { - return (__m256i) ((__v4du) __A * (__v4du) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mullo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_mullo_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mullo_epi64(__mmask8 __U, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_mullo_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mullo_epi64 (__m128i __A, __m128i __B) { - return (__m128i) ((__v2du) __A * (__v2du) __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mullo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_mullo_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mullo_epi64(__mmask8 __U, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_mullo_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_andnot_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_andnot_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_andnot_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_andnot_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_andnot_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_andnot_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_andnot_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_andnot_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_andnot_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_andnot_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_andnot_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_andnot_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_andnot_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_andnot_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_andnot_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_andnot_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ 
__m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_and_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_and_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_and_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_and_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_and_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_and_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_and_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_and_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_and_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_and_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_and_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_and_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_and_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_and_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_and_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_and_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_xor_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_xor_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_xor_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_xor_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_xor_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_xor_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_xor_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_xor_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_xor_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_xor_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_xor_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_xor_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_xor_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_xor_ps(__A, __B), - (__v4sf)__W); -} - -static 
__inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_xor_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_xor_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_or_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_or_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_or_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_or_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_or_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_or_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_or_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_or_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_or_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_or_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_or_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_or_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_or_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_or_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_or_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_or_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtpd_epi64 (__m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_epi64 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2qq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtpd_epi64 (__m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_epi64 (__mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2qq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtpd_epu64 (__m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2uqq128_mask 
((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_epu64 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2uqq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtpd_epu64 (__m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_epu64 (__mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvtpd2uqq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtps_epi64 (__m128 __A) { - return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2qq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtps_epi64 (__m128 __A) { - return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_epi64 (__mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvtps2qq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtps_epu64 (__m128 __A) { - return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2uqq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtps_epu64 (__m128 __A) { - return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, - (__v4di) __W, - (__mmask8) 
__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_epu64 (__mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvtps2uqq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_cvtepi64_pd (__m128i __A) { - return (__m128d)__builtin_convertvector((__v2di)__A, __v2df); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepi64_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepi64_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_pd (__m256i __A) { - return (__m256d)__builtin_convertvector((__v4di)__A, __v4df); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_pd (__m256d __W, __mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepi64_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_pd (__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepi64_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtepi64_ps (__m128i __A) { - return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_ps (__mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtqq2ps128_mask ((__v2di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_ps (__m256i __A) { - return (__m128)__builtin_convertvector((__v4di)__A, __v4sf); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepi64_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_ps (__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepi64_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttpd_epi64 (__m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttpd_epi64 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttpd_epi64 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2qq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttpd_epi64 (__m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, - (__v4di) 
_mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttpd_epi64 (__m256i __W, __mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttpd_epi64 (__mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2qq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttpd_epu64 (__m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttpd_epu64 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttpd_epu64 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2uqq128_mask ((__v2df) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttpd_epu64 (__m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttpd_epu64 (__m256i __W, __mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttpd_epu64 (__mmask8 __U, __m256d __A) { - return (__m256i) __builtin_ia32_cvttpd2uqq256_mask ((__v4df) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttps_epi64 (__m128 __A) { - return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttps_epi64 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2qq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttps_epi64 (__m128 __A) { - return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttps_epi64 (__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttps_epi64 (__mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvttps2qq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttps_epu64 (__m128 __A) { - return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttps_epu64 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, - (__v2di) __W, - 
(__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2uqq128_mask ((__v4sf) __A, - (__v2di) _mm_setzero_si128(), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttps_epu64 (__m128 __A) { - return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttps_epu64 (__m256i __W, __mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttps_epu64 (__mmask8 __U, __m128 __A) { - return (__m256i) __builtin_ia32_cvttps2uqq256_mask ((__v4sf) __A, - (__v4di) _mm256_setzero_si256(), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_cvtepu64_pd (__m128i __A) { - return (__m128d)__builtin_convertvector((__v2du)__A, __v2df); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu64_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepu64_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu64_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtepu64_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_cvtepu64_pd (__m256i __A) { - return (__m256d)__builtin_convertvector((__v4du)__A, __v4df); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu64_pd (__m256d __W, __mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepu64_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu64_pd (__mmask8 __U, __m256i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtepu64_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtepu64_ps (__m128i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu64_ps (__mmask8 __U, __m128i __A) { - return (__m128) __builtin_ia32_cvtuqq2ps128_mask ((__v2di) __A, - (__v4sf) _mm_setzero_ps(), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_cvtepu64_ps (__m256i __A) { - return (__m128)__builtin_convertvector((__v4du)__A, __v4sf); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu64_ps (__m128 __W, __mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepu64_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu64_ps (__mmask8 __U, __m256i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtepu64_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -#define _mm_range_pd(A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - 
(__v2df)_mm_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm_mask_range_pd(W, U, A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_range_pd(U, A, B, C) \ - ((__m128d)__builtin_ia32_rangepd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), (int)(C), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_range_pd(A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_range_pd(W, U, A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_range_pd(U, A, B, C) \ - ((__m256d)__builtin_ia32_rangepd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(C), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_range_ps(A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm_mask_range_ps(W, U, A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)(__m128)(W), (__mmask8)(U))) - -#define _mm_maskz_range_ps(U, A, B, C) \ - ((__m128)__builtin_ia32_rangeps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), (int)(C), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_range_ps(A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_range_ps(W, U, A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)(__m256)(W), (__mmask8)(U))) - -#define _mm256_maskz_range_ps(U, A, B, C) \ - ((__m256)__builtin_ia32_rangeps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(C), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm_reduce_pd(A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm_mask_reduce_pd(W, U, A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_reduce_pd(U, A, B) \ - ((__m128d)__builtin_ia32_reducepd128_mask((__v2df)(__m128d)(A), (int)(B), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_reduce_pd(A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_reduce_pd(W, U, A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_reduce_pd(U, A, B) \ - ((__m256d)__builtin_ia32_reducepd256_mask((__v4df)(__m256d)(A), (int)(B), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_reduce_ps(A, B) \ - ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm_mask_reduce_ps(W, U, A, B) \ - ((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_reduce_ps(U, A, B) \ - 
((__m128)__builtin_ia32_reduceps128_mask((__v4sf)(__m128)(A), (int)(B), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_reduce_ps(A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_reduce_ps(W, U, A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_reduce_ps(U, A, B) \ - ((__m256)__builtin_ia32_reduceps256_mask((__v8sf)(__m256)(A), (int)(B), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi32_mask (__m128i __A) -{ - return (__mmask8) __builtin_ia32_cvtd2mask128 ((__v4si) __A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_movepi32_mask (__m256i __A) -{ - return (__mmask8) __builtin_ia32_cvtd2mask256 ((__v8si) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_movm_epi32 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_cvtmask2d128 (__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_movm_epi32 (__mmask8 __A) -{ - return (__m256i) __builtin_ia32_cvtmask2d256 (__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_movm_epi64 (__mmask8 __A) -{ - return (__m128i) __builtin_ia32_cvtmask2q128 (__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_movm_epi64 (__mmask8 __A) -{ - return (__m256i) __builtin_ia32_cvtmask2q256 (__A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_movepi64_mask (__m128i __A) -{ - return (__mmask8) __builtin_ia32_cvtq2mask128 ((__v2di) __A); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_movepi64_mask (__m256i __A) -{ - return (__mmask8) __builtin_ia32_cvtq2mask256 ((__v4di) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f32x2 (__m128 __A) -{ - return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f32x2 (__m256 __O, __mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x2(__A), - (__v8sf)__O); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f32x2 (__mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x2(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f64x2(__m128d __A) -{ - return (__m256d)__builtin_shufflevector((__v2df)__A, (__v2df)__A, - 0, 1, 0, 1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f64x2(__m256d __O, __mmask8 __M, __m128d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, - (__v4df)_mm256_broadcast_f64x2(__A), - (__v4df)__O); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f64x2 (__mmask8 __M, __m128d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__M, - (__v4df)_mm256_broadcast_f64x2(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_broadcast_i32x2 (__m128i __A) -{ - return (__m128i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, - 0, 1, 0, 1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcast_i32x2 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_broadcast_i32x2(__A), - 
(__v4si)__O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_broadcast_i32x2(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcast_i32x2 (__m128i __A) -{ - return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, - 0, 1, 0, 1, 0, 1, 0, 1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i32x2 (__m256i __O, __mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x2(__A), - (__v8si)__O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i32x2 (__mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x2(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcast_i64x2(__m128i __A) -{ - return (__m256i)__builtin_shufflevector((__v2di)__A, (__v2di)__A, - 0, 1, 0, 1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i64x2(__m256i __O, __mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_broadcast_i64x2(__A), - (__v4di)__O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i64x2 (__mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_broadcast_i64x2(__A), - (__v4di)_mm256_setzero_si256()); -} - -#define _mm256_extractf64x2_pd(A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_undefined_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_extractf64x2_pd(W, U, A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_extractf64x2_pd(U, A, imm) \ - ((__m128d)__builtin_ia32_extractf64x2_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_extracti64x2_epi64(A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_undefined_si128(), \ - (__mmask8)-1)) - -#define _mm256_mask_extracti64x2_epi64(W, U, A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_extracti64x2_epi64(U, A, imm) \ - ((__m128i)__builtin_ia32_extracti64x2_256_mask((__v4di)(__m256i)(A), \ - (int)(imm), \ - (__v2di)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_insertf64x2(A, B, imm) \ - ((__m256d)__builtin_ia32_insertf64x2_256((__v4df)(__m256d)(A), \ - (__v2df)(__m128d)(B), (int)(imm))) - -#define _mm256_mask_insertf64x2(W, U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W))) - -#define _mm256_maskz_insertf64x2(U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_insertf64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd())) - -#define _mm256_inserti64x2(A, B, imm) \ - ((__m256i)__builtin_ia32_inserti64x2_256((__v4di)(__m256i)(A), \ - (__v2di)(__m128i)(B), (int)(imm))) - -#define _mm256_mask_inserti64x2(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), 
(B), (imm)), \ - (__v4di)(__m256i)(W))) - -#define _mm256_maskz_inserti64x2(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_inserti64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256())) - -#define _mm_mask_fpclass_pd_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_fpclass_pd_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd128_mask((__v2df)(__m128d)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_fpclass_pd_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_fpclass_pd_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclasspd256_mask((__v4df)(__m256d)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fpclass_ps_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_fpclass_ps_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclassps128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_fpclass_ps_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_fpclass_ps_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclassps256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__mmask8)-1)) - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlfp16intrin.h b/include/avx512vlfp16intrin.h deleted file mode 100644 index 3d27853..0000000 --- a/include/avx512vlfp16intrin.h +++ /dev/null @@ -1,2068 +0,0 @@ -/*===---------- avx512vlfp16intrin.h - AVX512-FP16 intrinsics --------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; include instead." -#endif - -#ifndef __AVX512VLFP16INTRIN_H -#define __AVX512VLFP16INTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512fp16, avx512vl"), \ - __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, \ - __target__("avx512fp16, avx512vl"), \ - __min_vector_width__(128))) - -static __inline__ _Float16 __DEFAULT_FN_ATTRS128 _mm_cvtsh_h(__m128h __a) { - return __a[0]; -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS256 _mm256_cvtsh_h(__m256h __a) { - return __a[0]; -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_set_sh(_Float16 __h) { - return __extension__(__m128h){__h, 0, 0, 0, 0, 0, 0, 0}; -} - -static __inline __m128h __DEFAULT_FN_ATTRS128 _mm_set1_ph(_Float16 __h) { - return (__m128h)(__v8hf){__h, __h, __h, __h, __h, __h, __h, __h}; -} - -static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_set1_ph(_Float16 __h) { - return (__m256h)(__v16hf){__h, __h, __h, __h, __h, __h, __h, __h, - __h, __h, __h, __h, __h, __h, __h, __h}; -} - -static __inline __m128h __DEFAULT_FN_ATTRS128 -_mm_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, - _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8) { - return (__m128h)(__v8hf){__h8, __h7, __h6, __h5, __h4, __h3, __h2, __h1}; -} - -static __inline __m256h __DEFAULT_FN_ATTRS256 -_mm256_set1_pch(_Float16 _Complex h) { - return (__m256h)_mm256_set1_ps(__builtin_bit_cast(float, h)); -} - -static __inline __m128h __DEFAULT_FN_ATTRS128 -_mm_set1_pch(_Float16 _Complex h) { - return (__m128h)_mm_set1_ps(__builtin_bit_cast(float, h)); -} - -static __inline __m256h __DEFAULT_FN_ATTRS256 -_mm256_set_ph(_Float16 __h1, _Float16 __h2, _Float16 __h3, _Float16 __h4, - _Float16 __h5, _Float16 __h6, _Float16 __h7, _Float16 __h8, - _Float16 __h9, _Float16 __h10, _Float16 __h11, _Float16 __h12, - _Float16 __h13, _Float16 __h14, _Float16 __h15, _Float16 __h16) { - return (__m256h)(__v16hf){__h16, __h15, __h14, __h13, __h12, __h11, - __h10, __h9, __h8, __h7, __h6, __h5, - __h4, __h3, __h2, __h1}; -} - -#define _mm_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8) \ - _mm_set_ph((h8), (h7), (h6), (h5), (h4), (h3), (h2), (h1)) - -#define _mm256_setr_ph(h1, h2, h3, h4, h5, h6, h7, h8, h9, h10, h11, h12, h13, \ - h14, h15, h16) \ - _mm256_set_ph((h16), (h15), (h14), (h13), (h12), (h11), (h10), (h9), (h8), \ - (h7), (h6), (h5), (h4), (h3), (h2), (h1)) - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_add_ph(__m256h __A, - __m256h __B) { - return (__m256h)((__v16hf)__A + (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_add_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_add_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_add_ph(__m128h __A, - __m128h __B) { - return (__m128h)((__v8hf)__A + (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_add_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_add_ph(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_add_ph(__A, __B), - 
(__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_sub_ph(__m256h __A, - __m256h __B) { - return (__m256h)((__v16hf)__A - (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_sub_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_sub_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_sub_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sub_ph(__m128h __A, - __m128h __B) { - return (__m128h)((__v8hf)__A - (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sub_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sub_ph(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_sub_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_mul_ph(__m256h __A, - __m256h __B) { - return (__m256h)((__v16hf)__A * (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_mul_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mul_ph(__m128h __A, - __m128h __B) { - return (__m128h)((__v8hf)__A * (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_mul_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_mul_ph(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_mul_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_div_ph(__m256h __A, - __m256h __B) { - return (__m256h)((__v16hf)__A / (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_div_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_div_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - __U, (__v16hf)_mm256_div_ph(__A, __B), (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_div_ph(__m128h __A, - __m128h __B) { - return (__m128h)((__v8hf)__A / (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_div_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_div_ph(__mmask8 __U, - __m128h __A, - __m128h 
__B) { - return (__m128h)__builtin_ia32_selectph_128(__U, (__v8hf)_mm_div_ph(__A, __B), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_min_ph(__m256h __A, - __m256h __B) { - return (__m256h)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_min_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - (__v16hf)__builtin_ia32_minph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_min_ph(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_min_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_min_ph(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_minph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_max_ph(__m256h __A, - __m256h __B) { - return (__m256h)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_max_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - (__v16hf)__builtin_ia32_maxph256((__v16hf)__A, (__v16hf)__B), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_max_ph(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_max_ph(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_max_ph(__mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)__builtin_ia32_maxph128((__v8hf)__A, (__v8hf)__B), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_abs_ph(__m256h __A) { - return (__m256h)_mm256_and_epi32(_mm256_set1_epi32(0x7FFF7FFF), (__m256i)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_abs_ph(__m128h __A) { - return (__m128h)_mm_and_epi32(_mm_set1_epi32(0x7FFF7FFF), (__m128i)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_conj_pch(__m256h __A) { - return (__m256h)_mm256_xor_ps((__m256)__A, _mm256_set1_ps(-0.0f)); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_conj_pch(__m256h __W, __mmask8 __U, 
__m256h __A) { - return (__m256h)__builtin_ia32_selectps_256( - (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_conj_pch(__mmask8 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectps_256( - (__mmask8)__U, (__v8sf)_mm256_conj_pch(__A), (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_conj_pch(__m128h __A) { - return (__m128h)_mm_xor_ps((__m128)__A, _mm_set1_ps(-0.0f)); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_conj_pch(__m128h __W, - __mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_selectps_128( - (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_conj_pch(__mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_selectps_128( - (__mmask8)__U, (__v4sf)_mm_conj_pch(__A), (__v4sf)_mm_setzero_ps()); -} - -#define _mm256_cmp_ph_mask(a, b, p) \ - ((__mmask16)__builtin_ia32_cmpph256_mask( \ - (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)-1)) - -#define _mm256_mask_cmp_ph_mask(m, a, b, p) \ - ((__mmask16)__builtin_ia32_cmpph256_mask( \ - (__v16hf)(__m256h)(a), (__v16hf)(__m256h)(b), (int)(p), (__mmask16)(m))) - -#define _mm_cmp_ph_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpph128_mask( \ - (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)-1)) - -#define _mm_mask_cmp_ph_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpph128_mask( \ - (__v8hf)(__m128h)(a), (__v8hf)(__m128h)(b), (int)(p), (__mmask8)(m))) - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rcp_ph(__m256h __A) { - return (__m256h)__builtin_ia32_rcpph256_mask( - (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_rcp_ph(__m256h __W, __mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_rcpph256_mask((__v16hf)__A, (__v16hf)__W, - (__mmask16)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_rcp_ph(__mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_rcpph256_mask( - (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rcp_ph(__m128h __A) { - return (__m128h)__builtin_ia32_rcpph128_mask( - (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rcp_ph(__m128h __W, - __mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_rcpph128_mask((__v8hf)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_rcp_ph(__mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_rcpph128_mask( - (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_rsqrt_ph(__m256h __A) { - return (__m256h)__builtin_ia32_rsqrtph256_mask( - (__v16hf)__A, (__v16hf)_mm256_undefined_ph(), (__mmask16)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_rsqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_rsqrtph256_mask((__v16hf)__A, (__v16hf)__W, - (__mmask16)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_rsqrt_ph(__mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_rsqrtph256_mask( - (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_rsqrt_ph(__m128h __A) { - return 
(__m128h)__builtin_ia32_rsqrtph128_mask( - (__v8hf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_rsqrt_ph(__m128h __W, - __mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_rsqrtph128_mask((__v8hf)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt_ph(__mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_rsqrtph128_mask( - (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_getexp_ph(__m128h __A) { - return (__m128h)__builtin_ia32_getexpph128_mask( - (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_ph(__m128h __W, __mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_getexpph128_mask((__v8hf)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_ph(__mmask8 __U, __m128h __A) { - return (__m128h)__builtin_ia32_getexpph128_mask( - (__v8hf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_getexp_ph(__m256h __A) { - return (__m256h)__builtin_ia32_getexpph256_mask( - (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_getexp_ph(__m256h __W, __mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_getexpph256_mask((__v16hf)__A, (__v16hf)__W, - (__mmask16)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_getexp_ph(__mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_getexpph256_mask( - (__v16hf)__A, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); -} - -#define _mm_getmant_ph(A, B, C) \ - ((__m128h)__builtin_ia32_getmantph128_mask( \ - (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1)) - -#define _mm_mask_getmant_ph(W, U, A, B, C) \ - ((__m128h)__builtin_ia32_getmantph128_mask( \ - (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)(__m128h)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_getmant_ph(U, A, B, C) \ - ((__m128h)__builtin_ia32_getmantph128_mask( \ - (__v8hf)(__m128h)(A), (int)(((C) << 2) | (B)), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U))) - -#define _mm256_getmant_ph(A, B, C) \ - ((__m256h)__builtin_ia32_getmantph256_mask( \ - (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ - (__v16hf)_mm256_setzero_ph(), (__mmask16)-1)) - -#define _mm256_mask_getmant_ph(W, U, A, B, C) \ - ((__m256h)__builtin_ia32_getmantph256_mask( \ - (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), (__v16hf)(__m256h)(W), \ - (__mmask16)(U))) - -#define _mm256_maskz_getmant_ph(U, A, B, C) \ - ((__m256h)__builtin_ia32_getmantph256_mask( \ - (__v16hf)(__m256h)(A), (int)(((C) << 2) | (B)), \ - (__v16hf)_mm256_setzero_ph(), (__mmask16)(U))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_scalef_ph(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_scalefph128_mask( - (__v8hf)__A, (__v8hf)__B, (__v8hf)_mm_setzero_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_ph(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_scalefph128_mask((__v8hf)__A, (__v8hf)__B, - (__v8hf)__W, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_ph(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_scalefph128_mask( - (__v8hf)__A, (__v8hf)__B, 
(__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_scalef_ph(__m256h __A, - __m256h __B) { - return (__m256h)__builtin_ia32_scalefph256_mask( - (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_scalef_ph(__m256h __W, __mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_scalefph256_mask((__v16hf)__A, (__v16hf)__B, - (__v16hf)__W, (__mmask16)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_scalef_ph(__mmask16 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_scalefph256_mask( - (__v16hf)__A, (__v16hf)__B, (__v16hf)_mm256_setzero_ph(), (__mmask16)__U); -} - -#define _mm_roundscale_ph(A, imm) \ - ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ - (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1)) - -#define _mm_mask_roundscale_ph(W, U, A, imm) \ - ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ - (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) - -#define _mm_maskz_roundscale_ph(U, A, imm) \ - ((__m128h)__builtin_ia32_rndscaleph_128_mask( \ - (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U))) - -#define _mm256_roundscale_ph(A, imm) \ - ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ - (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ - (__mmask16)-1)) - -#define _mm256_mask_roundscale_ph(W, U, A, imm) \ - ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ - (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)(__m256h)(W), \ - (__mmask16)(U))) - -#define _mm256_maskz_roundscale_ph(U, A, imm) \ - ((__m256h)__builtin_ia32_rndscaleph_256_mask( \ - (__v16hf)(__m256h)(A), (int)(imm), (__v16hf)_mm256_setzero_ph(), \ - (__mmask16)(U))) - -#define _mm_reduce_ph(A, imm) \ - ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ - (__v8hf)_mm_setzero_ph(), \ - (__mmask8)-1)) - -#define _mm_mask_reduce_ph(W, U, A, imm) \ - ((__m128h)__builtin_ia32_reduceph128_mask( \ - (__v8hf)(__m128h)(A), (int)(imm), (__v8hf)(__m128h)(W), (__mmask8)(U))) - -#define _mm_maskz_reduce_ph(U, A, imm) \ - ((__m128h)__builtin_ia32_reduceph128_mask((__v8hf)(__m128h)(A), (int)(imm), \ - (__v8hf)_mm_setzero_ph(), \ - (__mmask8)(U))) - -#define _mm256_reduce_ph(A, imm) \ - ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ - (__v16hf)_mm256_setzero_ph(), \ - (__mmask16)-1)) - -#define _mm256_mask_reduce_ph(W, U, A, imm) \ - ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ - (__v16hf)(__m256h)(W), \ - (__mmask16)(U))) - -#define _mm256_maskz_reduce_ph(U, A, imm) \ - ((__m256h)__builtin_ia32_reduceph256_mask((__v16hf)(__m256h)(A), (int)(imm), \ - (__v16hf)_mm256_setzero_ph(), \ - (__mmask16)(U))) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_sqrt_ph(__m128h __a) { - return __builtin_ia32_sqrtph((__v8hf)__a); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_sqrt_ph(__m128h __W, - __mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_maskz_sqrt_ph(__mmask8 __U, - __m128h __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_sqrt_ph(__A), (__v8hf)_mm_setzero_ph()); -} - -static __inline __m256h __DEFAULT_FN_ATTRS256 _mm256_sqrt_ph(__m256h __a) { - return 
(__m256h)__builtin_ia32_sqrtph256((__v16hf)__a); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_sqrt_ph(__m256h __W, __mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, (__v16hf)_mm256_sqrt_ph(__A), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_sqrt_ph(__mmask16 __U, __m256h __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_sqrt_ph(__A), - (__v16hf)_mm256_setzero_ph()); -} - -#define _mm_mask_fpclass_ph_mask(U, A, imm) \ - ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ - (int)(imm), (__mmask8)(U))) - -#define _mm_fpclass_ph_mask(A, imm) \ - ((__mmask8)__builtin_ia32_fpclassph128_mask((__v8hf)(__m128h)(A), \ - (int)(imm), (__mmask8)-1)) - -#define _mm256_mask_fpclass_ph_mask(U, A, imm) \ - ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ - (int)(imm), (__mmask16)(U))) - -#define _mm256_fpclass_ph_mask(A, imm) \ - ((__mmask16)__builtin_ia32_fpclassph256_mask((__v16hf)(__m256h)(A), \ - (int)(imm), (__mmask16)-1)) - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtpd_ph(__m128d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( - (__v2df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtpd_ph(__m128h __W, - __mmask8 __U, - __m128d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph128_mask((__v2df)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_ph(__mmask8 __U, __m128d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph128_mask( - (__v2df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtpd_ph(__m256d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( - (__v4df)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_ph(__m128h __W, __mmask8 __U, __m256d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph256_mask((__v4df)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_ph(__mmask8 __U, __m256d __A) { - return (__m128h)__builtin_ia32_vcvtpd2ph256_mask( - (__v4df)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_cvtph_pd(__m128h __A) { - return (__m128d)__builtin_ia32_vcvtph2pd128_mask( - (__v8hf)__A, (__v2df)_mm_undefined_pd(), (__mmask8)-1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 _mm_mask_cvtph_pd(__m128d __W, - __mmask8 __U, - __m128h __A) { - return (__m128d)__builtin_ia32_vcvtph2pd128_mask((__v8hf)__A, (__v2df)__W, - (__mmask8)__U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { - return (__m128d)__builtin_ia32_vcvtph2pd128_mask( - (__v8hf)__A, (__v2df)_mm_setzero_pd(), (__mmask8)__U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 _mm256_cvtph_pd(__m128h __A) { - return (__m256d)__builtin_ia32_vcvtph2pd256_mask( - (__v8hf)__A, (__v4df)_mm256_undefined_pd(), (__mmask8)-1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_pd(__m256d __W, __mmask8 __U, __m128h __A) { - return (__m256d)__builtin_ia32_vcvtph2pd256_mask((__v8hf)__A, (__v4df)__W, - (__mmask8)__U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_pd(__mmask8 __U, __m128h __A) { - return (__m256d)__builtin_ia32_vcvtph2pd256_mask( - (__v8hf)__A, 
(__v4df)_mm256_setzero_pd(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi16(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2w128_mask( - (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2w128_mask((__v8hf)__A, (__v8hi)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epi16(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2w128_mask( - (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epi16(__m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2w256_mask( - (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2w256_mask((__v16hf)__A, (__v16hi)__W, - (__mmask16)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epi16(__mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2w256_mask( - (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi16(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2w128_mask( - (__v8hf)__A, (__v8hi)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epi16(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2w128_mask((__v8hf)__A, (__v8hi)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epi16(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2w128_mask( - (__v8hf)__A, (__v8hi)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epi16(__m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2w256_mask( - (__v16hf)__A, (__v16hi)_mm256_undefined_si256(), (__mmask16)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttph_epi16(__m256i __W, __mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2w256_mask((__v16hf)__A, (__v16hi)__W, - (__mmask16)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epi16(__mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2w256_mask( - (__v16hf)__A, (__v16hi)_mm256_setzero_si256(), (__mmask16)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi16_ph(__m128i __A) { - return (__m128h) __builtin_convertvector((__v8hi)__A, __v8hf); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi16_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi16_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepi16_ph(__A), (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_cvtepi16_ph(__m256i __A) { - return (__m256h) __builtin_convertvector((__v16hi)__A, __v16hf); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi16_ph(__m256h __W, __mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256( - 
(__mmask16)__U, (__v16hf)_mm256_cvtepi16_ph(__A), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi16_ph(__mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_cvtepi16_ph(__A), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu16(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uw128_mask( - (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uw128_mask((__v8hf)__A, (__v8hu)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epu16(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uw128_mask( - (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epu16(__m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2uw256_mask( - (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2uw256_mask((__v16hf)__A, (__v16hu)__W, - (__mmask16)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epu16(__mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvtph2uw256_mask( - (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu16(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uw128_mask( - (__v8hf)__A, (__v8hu)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epu16(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uw128_mask((__v8hf)__A, (__v8hu)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epu16(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uw128_mask( - (__v8hf)__A, (__v8hu)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epu16(__m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2uw256_mask( - (__v16hf)__A, (__v16hu)_mm256_undefined_si256(), (__mmask16)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttph_epu16(__m256i __W, __mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2uw256_mask((__v16hf)__A, (__v16hu)__W, - (__mmask16)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epu16(__mmask16 __U, __m256h __A) { - return (__m256i)__builtin_ia32_vcvttph2uw256_mask( - (__v16hf)__A, (__v16hu)_mm256_setzero_si256(), (__mmask16)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu16_ph(__m128i __A) { - return (__m128h) __builtin_convertvector((__v8hu)__A, __v8hf); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu16_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu16_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm_cvtepu16_ph(__A), (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 
-_mm256_cvtepu16_ph(__m256i __A) { - return (__m256h) __builtin_convertvector((__v16hu)__A, __v16hf); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu16_ph(__m256h __W, __mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, (__v16hf)_mm256_cvtepu16_ph(__A), (__v16hf)__W); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu16_ph(__mmask16 __U, __m256i __A) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, - (__v16hf)_mm256_cvtepu16_ph(__A), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi32(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2dq128_mask( - (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2dq128_mask((__v8hf)__A, (__v4si)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2dq128_mask( - (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epi32(__m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2dq256_mask( - (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2dq256_mask((__v8hf)__A, (__v8si)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epi32(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2dq256_mask( - (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu32(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2udq128_mask( - (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2udq128_mask((__v8hf)__A, (__v4su)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2udq128_mask( - (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epu32(__m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2udq256_mask( - (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2udq256_mask((__v8hf)__A, (__v8su)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epu32(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2udq256_mask( - (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi32_ph(__m128i __A) { - return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( - (__v4si)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtdq2ph128_mask((__v4si)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi32_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtdq2ph128_mask( - (__v4si)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_cvtepi32_ph(__m256i __A) { - return (__m128h) __builtin_convertvector((__v8si)__A, __v8hf); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi32_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepi32_ph(__A), (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu32_ph(__m128i __A) { - return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( - (__v4su)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtudq2ph128_mask((__v4su)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu32_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtudq2ph128_mask( - (__v4su)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_cvtepu32_ph(__m256i __A) { - return (__m128h) __builtin_convertvector((__v8su)__A, __v8hf); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu32_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)__W); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu32_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, (__v8hf)_mm256_cvtepu32_ph(__A), (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi32(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2dq128_mask( - (__v8hf)__A, (__v4si)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epi32(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2dq128_mask((__v8hf)__A, (__v4si)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2dq128_mask( - (__v8hf)__A, (__v4si)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epi32(__m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2dq256_mask( - (__v8hf)__A, (__v8si)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttph_epi32(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2dq256_mask((__v8hf)__A, (__v8si)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epi32(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2dq256_mask( - (__v8hf)__A, (__v8si)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu32(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2udq128_mask( - (__v8hf)__A, (__v4su)_mm_undefined_si128(), (__mmask8)-1); 
-} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epu32(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2udq128_mask((__v8hf)__A, (__v4su)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2udq128_mask( - (__v8hf)__A, (__v4su)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epu32(__m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2udq256_mask( - (__v8hf)__A, (__v8su)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttph_epu32(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2udq256_mask((__v8hf)__A, (__v8su)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epu32(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2udq256_mask( - (__v8hf)__A, (__v8su)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepi64_ph(__m128i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( - (__v2di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph128_mask((__v2di)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph128_mask( - (__v2di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_ph(__m256i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( - (__v4di)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph256_mask((__v4di)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_vcvtqq2ph256_mask( - (__v4di)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epi64(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2qq128_mask( - (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2qq128_mask((__v8hf)__A, (__v2di)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2qq128_mask( - (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epi64(__m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2qq256_mask( - (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2qq256_mask((__v8hf)__A, (__v4di)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epi64(__mmask8 __U, __m128h __A) { - return 
(__m256i)__builtin_ia32_vcvtph2qq256_mask( - (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtepu64_ph(__m128i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( - (__v2du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask((__v2du)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu64_ph(__mmask8 __U, __m128i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph128_mask( - (__v2du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_cvtepu64_ph(__m256i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( - (__v4du)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu64_ph(__m128h __W, __mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask((__v4du)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu64_ph(__mmask8 __U, __m256i __A) { - return (__m128h)__builtin_ia32_vcvtuqq2ph256_mask( - (__v4du)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvtph_epu64(__m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( - (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uqq128_mask((__v8hf)__A, (__v2du)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvtph2uqq128_mask( - (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtph_epu64(__m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( - (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2uqq256_mask((__v8hf)__A, (__v4du)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_epu64(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvtph2uqq256_mask( - (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epi64(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2qq128_mask( - (__v8hf)__A, (__v2di)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epi64(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2qq128_mask((__v8hf)__A, (__v2di)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2qq128_mask( - (__v8hf)__A, (__v2di)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epi64(__m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2qq256_mask( - (__v8hf)__A, (__v4di)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_mask_cvttph_epi64(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2qq256_mask((__v8hf)__A, (__v4di)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epi64(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2qq256_mask( - (__v8hf)__A, (__v4di)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_cvttph_epu64(__m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( - (__v8hf)__A, (__v2du)_mm_undefined_si128(), (__mmask8)-1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttph_epu64(__m128i __W, __mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uqq128_mask((__v8hf)__A, (__v2du)__W, - (__mmask8)__U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { - return (__m128i)__builtin_ia32_vcvttph2uqq128_mask( - (__v8hf)__A, (__v2du)_mm_setzero_si128(), (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttph_epu64(__m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( - (__v8hf)__A, (__v4du)_mm256_undefined_si256(), (__mmask8)-1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttph_epu64(__m256i __W, __mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2uqq256_mask((__v8hf)__A, (__v4du)__W, - (__mmask8)__U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttph_epu64(__mmask8 __U, __m128h __A) { - return (__m256i)__builtin_ia32_vcvttph2uqq256_mask( - (__v8hf)__A, (__v4du)_mm256_setzero_si256(), (__mmask8)__U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_cvtxph_ps(__m128h __A) { - return (__m128)__builtin_ia32_vcvtph2psx128_mask( - (__v8hf)__A, (__v4sf)_mm_undefined_ps(), (__mmask8)-1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 _mm_mask_cvtxph_ps(__m128 __W, - __mmask8 __U, - __m128h __A) { - return (__m128)__builtin_ia32_vcvtph2psx128_mask((__v8hf)__A, (__v4sf)__W, - (__mmask8)__U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { - return (__m128)__builtin_ia32_vcvtph2psx128_mask( - (__v8hf)__A, (__v4sf)_mm_setzero_ps(), (__mmask8)__U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 _mm256_cvtxph_ps(__m128h __A) { - return (__m256)__builtin_ia32_vcvtph2psx256_mask( - (__v8hf)__A, (__v8sf)_mm256_undefined_ps(), (__mmask8)-1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtxph_ps(__m256 __W, __mmask8 __U, __m128h __A) { - return (__m256)__builtin_ia32_vcvtph2psx256_mask((__v8hf)__A, (__v8sf)__W, - (__mmask8)__U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtxph_ps(__mmask8 __U, __m128h __A) { - return (__m256)__builtin_ia32_vcvtph2psx256_mask( - (__v8hf)__A, (__v8sf)_mm256_setzero_ps(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_cvtxps_ph(__m128 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx128_mask( - (__v4sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_cvtxps_ph(__m128h __W, - __mmask8 __U, - __m128 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx128_mask((__v4sf)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtxps_ph(__mmask8 __U, __m128 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx128_mask( - (__v4sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - 
-static __inline__ __m128h __DEFAULT_FN_ATTRS256 _mm256_cvtxps_ph(__m256 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx256_mask( - (__v8sf)__A, (__v8hf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtxps_ph(__m128h __W, __mmask8 __U, __m256 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx256_mask((__v8sf)__A, (__v8hf)__W, - (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtxps_ph(__mmask8 __U, __m256 __A) { - return (__m128h)__builtin_ia32_vcvtps2phx256_mask( - (__v8sf)__A, (__v8hf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmadd_ph(__m128h __A, - __mmask8 __U, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsub_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, - -(__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmsub_ph(__m128h __A, - __mmask8 __U, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, _mm_fmsub_ph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph(-(__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_ph(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmadd_ph(__m256h __A, 
__mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmsub_ph(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, - -(__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256(-(__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmaddsub_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fmaddsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmaddsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmaddsub_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, (__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static 
__inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmsubadd_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, - -(__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fmsubadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsubadd_ph(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)_mm_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmaddsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmaddsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmaddsub_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, (__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, - -(__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsubadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsubadd_ph(__mmask16 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)_mm256_setzero_ph()); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsubadd_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddsubph((__v8hf)__A, (__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ 
__m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsubadd_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddsubph256((__v16hf)__A, (__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmadd_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, - (__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, (__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmadd_ph(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, - (__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmadd_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, (__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fnmsub_ph(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, - -(__v8hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_ph(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), - (__v8hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_ph(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_selectph_128( - (__mmask8)__U, - __builtin_ia32_vfmaddph((__v8hf)__A, -(__v8hf)__B, -(__v8hf)__C), - (__v8hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fnmsub_ph(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, - -(__v16hf)__C); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmsub_ph(__m256h __A, __mmask16 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), - (__v16hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmsub_ph(__m256h __A, __m256h __B, __m256h __C, __mmask16 __U) { - return (__m256h)__builtin_ia32_selectph_256( - (__mmask16)__U, - __builtin_ia32_vfmaddph256((__v16hf)__A, -(__v16hf)__B, -(__v16hf)__C), - (__v16hf)__C); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmul_pch(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcph128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fcmul_pch(__m128h __W, __mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fcmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfcmulcph128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS128 
_mm256_fcmul_pch(__m256h __A, - __m256h __B) { - return (__m256h)__builtin_ia32_vfcmulcph256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fcmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_vfcmulcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fcmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_vfcmulcph256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fcmadd_pch(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fcmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectps_128( - __U, - __builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)(__m128h)__B, - (__v4sf)__C, (__mmask8)__U), - (__v4sf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fcmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfcmaddcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fcmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfcmaddcph128_maskz( - (__v4sf)__A, (__v4sf)__B, (__v4sf)__C, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fcmadd_pch(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fcmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectps_256( - __U, - __builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, - (__mmask8)__U), - (__v8sf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fcmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { - return (__m256h)__builtin_ia32_vfcmaddcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fcmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfcmaddcph256_maskz( - (__v8sf)__A, (__v8sf)__B, (__v8sf)__C, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmul_pch(__m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcph128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_fmul_pch(__m128h __W, - __mmask8 __U, - __m128h __A, - __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__W, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmul_pch(__mmask8 __U, __m128h __A, __m128h __B) { - return (__m128h)__builtin_ia32_vfmulcph128_mask( - (__v4sf)__A, (__v4sf)__B, (__v4sf)_mm_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmul_pch(__m256h __A, - __m256h __B) { - return (__m256h)__builtin_ia32_vfmulcph256_mask( - (__v8sf)__A, (__v8sf)__B, 
(__v8sf)_mm256_undefined_ph(), (__mmask8)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmul_pch(__m256h __W, __mmask8 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_vfmulcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__W, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmul_pch(__mmask8 __U, __m256h __A, __m256h __B) { - return (__m256h)__builtin_ia32_vfmulcph256_mask( - (__v8sf)__A, (__v8sf)__B, (__v8sf)_mm256_setzero_ph(), (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_fmadd_pch(__m128h __A, - __m128h __B, - __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)-1); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_pch(__m128h __A, __mmask8 __U, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_selectps_128( - __U, - __builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, (__v4sf)__C, - (__mmask8)__U), - (__v4sf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_pch(__m128h __A, __m128h __B, __m128h __C, __mmask8 __U) { - return (__m128h)__builtin_ia32_vfmaddcph128_mask((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_pch(__mmask8 __U, __m128h __A, __m128h __B, __m128h __C) { - return (__m128h)__builtin_ia32_vfmaddcph128_maskz((__v4sf)__A, (__v4sf)__B, - (__v4sf)__C, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 _mm256_fmadd_pch(__m256h __A, - __m256h __B, - __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)-1); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_fmadd_pch(__m256h __A, __mmask8 __U, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_selectps_256( - __U, - __builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, (__v8sf)__C, - (__mmask8)__U), - (__v8sf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmadd_pch(__m256h __A, __m256h __B, __m256h __C, __mmask8 __U) { - return (__m256h)__builtin_ia32_vfmaddcph256_mask((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmadd_pch(__mmask8 __U, __m256h __A, __m256h __B, __m256h __C) { - return (__m256h)__builtin_ia32_vfmaddcph256_maskz((__v8sf)__A, (__v8sf)__B, - (__v8sf)__C, (__mmask8)__U); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 _mm_mask_blend_ph(__mmask8 __U, - __m128h __A, - __m128h __W) { - return (__m128h)__builtin_ia32_selectph_128((__mmask8)__U, (__v8hf)__W, - (__v8hf)__A); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_ph(__mmask16 __U, __m256h __A, __m256h __W) { - return (__m256h)__builtin_ia32_selectph_256((__mmask16)__U, (__v16hf)__W, - (__v16hf)__A); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_permutex2var_ph(__m128h __A, __m128i __I, __m128h __B) { - return (__m128h)__builtin_ia32_vpermi2varhi128((__v8hi)__A, (__v8hi)__I, - (__v8hi)__B); -} - -static __inline__ __m256h __DEFAULT_FN_ATTRS256 -_mm256_permutex2var_ph(__m256h __A, __m256i __I, __m256h __B) { - return (__m256h)__builtin_ia32_vpermi2varhi256((__v16hi)__A, (__v16hi)__I, - (__v16hi)__B); -} - -static __inline__ __m128h __DEFAULT_FN_ATTRS128 -_mm_permutexvar_ph(__m128i __A, __m128h __B) { - return (__m128h)__builtin_ia32_permvarhi128((__v8hi)__B, (__v8hi)__A); -} - -static __inline__ 
__m256h __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_ph(__m256i __A, __m256h __B) { - return (__m256h)__builtin_ia32_permvarhi256((__v16hi)__B, (__v16hi)__A); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS256 -_mm256_reduce_add_ph(__m256h __W) { - return __builtin_ia32_reduce_fadd_ph256(-0.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS256 -_mm256_reduce_mul_ph(__m256h __W) { - return __builtin_ia32_reduce_fmul_ph256(1.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS256 -_mm256_reduce_max_ph(__m256h __V) { - return __builtin_ia32_reduce_fmax_ph256(__V); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS256 -_mm256_reduce_min_ph(__m256h __V) { - return __builtin_ia32_reduce_fmin_ph256(__V); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS128 -_mm_reduce_add_ph(__m128h __W) { - return __builtin_ia32_reduce_fadd_ph128(-0.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS128 -_mm_reduce_mul_ph(__m128h __W) { - return __builtin_ia32_reduce_fmul_ph128(1.0f16, __W); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS128 -_mm_reduce_max_ph(__m128h __V) { - return __builtin_ia32_reduce_fmax_ph128(__V); -} - -static __inline__ _Float16 __DEFAULT_FN_ATTRS128 -_mm_reduce_min_ph(__m128h __V) { - return __builtin_ia32_reduce_fmin_ph128(__V); -} - -// intrinsics below are alias for f*mul_*ch -#define _mm_mul_pch(A, B) _mm_fmul_pch(A, B) -#define _mm_mask_mul_pch(W, U, A, B) _mm_mask_fmul_pch(W, U, A, B) -#define _mm_maskz_mul_pch(U, A, B) _mm_maskz_fmul_pch(U, A, B) -#define _mm256_mul_pch(A, B) _mm256_fmul_pch(A, B) -#define _mm256_mask_mul_pch(W, U, A, B) _mm256_mask_fmul_pch(W, U, A, B) -#define _mm256_maskz_mul_pch(U, A, B) _mm256_maskz_fmul_pch(U, A, B) - -#define _mm_cmul_pch(A, B) _mm_fcmul_pch(A, B) -#define _mm_mask_cmul_pch(W, U, A, B) _mm_mask_fcmul_pch(W, U, A, B) -#define _mm_maskz_cmul_pch(U, A, B) _mm_maskz_fcmul_pch(U, A, B) -#define _mm256_cmul_pch(A, B) _mm256_fcmul_pch(A, B) -#define _mm256_mask_cmul_pch(W, U, A, B) _mm256_mask_fcmul_pch(W, U, A, B) -#define _mm256_maskz_cmul_pch(U, A, B) _mm256_maskz_fcmul_pch(U, A, B) - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlintrin.h b/include/avx512vlintrin.h deleted file mode 100644 index c6b4a44..0000000 --- a/include/avx512vlintrin.h +++ /dev/null @@ -1,8485 +0,0 @@ -/*===---- avx512vlintrin.h - AVX512VL intrinsics ---------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __AVX512VLINTRIN_H -#define __AVX512VLINTRIN_H - -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl"), __min_vector_width__(256))) - -typedef short __v2hi __attribute__((__vector_size__(4))); -typedef char __v4qi __attribute__((__vector_size__(4))); -typedef char __v2qi __attribute__((__vector_size__(2))); - -/* Integer compare */ - -#define _mm_cmpeq_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epi32_mask(A, B) \ - _mm_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epi32_mask(k, A, B) \ - _mm_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epi32_mask(A, B) \ - _mm256_cmp_epi32_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epi32_mask(k, A, B) \ - _mm256_mask_cmp_epi32_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epu32_mask(k, A, B) \ - _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epu32_mask(k, A, B) \ - _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epu32_mask(k, A, B) \ - _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epu32_mask(k, A, B) \ - 
_mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epu32_mask(k, A, B) \ - _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epu32_mask(A, B) \ - _mm_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epu32_mask(k, A, B) \ - _mm_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epu32_mask(A, B) \ - _mm256_cmp_epu32_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epu32_mask(k, A, B) \ - _mm256_mask_cmp_epu32_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epi64_mask(A, B) \ - _mm_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epi64_mask(k, A, B) \ - _mm_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LE) -#define 
_mm256_cmplt_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epi64_mask(A, B) \ - _mm256_cmp_epi64_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epi64_mask(k, A, B) \ - _mm256_mask_cmp_epi64_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm_cmpeq_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) -#define _mm_mask_cmpeq_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm_cmpge_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) -#define _mm_mask_cmpge_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm_cmpgt_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) -#define _mm_mask_cmpgt_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm_cmple_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) -#define _mm_mask_cmple_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm_cmplt_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) -#define _mm_mask_cmplt_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm_cmpneq_epu64_mask(A, B) \ - _mm_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) -#define _mm_mask_cmpneq_epu64_mask(k, A, B) \ - _mm_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) - -#define _mm256_cmpeq_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_EQ) -#define _mm256_mask_cmpeq_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_EQ) -#define _mm256_cmpge_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GE) -#define _mm256_mask_cmpge_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GE) -#define _mm256_cmpgt_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_GT) -#define _mm256_mask_cmpgt_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_GT) -#define _mm256_cmple_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LE) -#define _mm256_mask_cmple_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LE) -#define _mm256_cmplt_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_LT) -#define _mm256_mask_cmplt_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_LT) -#define _mm256_cmpneq_epu64_mask(A, B) \ - _mm256_cmp_epu64_mask((A), (B), _MM_CMPINT_NE) -#define _mm256_mask_cmpneq_epu64_mask(k, A, B) \ - _mm256_mask_cmp_epu64_mask((k), (A), (B), _MM_CMPINT_NE) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_add_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_add_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_add_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_add_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_add_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_epi64(__mmask8 __U, __m256i __A, 
__m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_add_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sub_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sub_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sub_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sub_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sub_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sub_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sub_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sub_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_add_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_add_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_add_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_add_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_add_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_add_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_add_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_add_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sub_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sub_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sub_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sub_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sub_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sub_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sub_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epi32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epi32(__X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epi32(__X, 
__Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epi32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epi32(__X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epi32(__mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epi32(__X, __Y), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_epu32(__m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epu32(__X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_epu32(__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_mul_epu32(__X, __Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mul_epu32(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epu32(__X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_epu32(__mmask8 __M, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_mul_epu32(__X, __Y), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mullo_epi32(__mmask8 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_mullo_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mullo_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_mullo_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mullo_epi32(__mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_mullo_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mullo_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_mullo_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_and_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a & (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_and_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_and_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_and_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_and_epi32(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_and_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4su)__a & (__v4su)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_and_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_and_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
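Illustrative usage (a minimal editorial sketch, not part of this patch): the compare-mask macros above produce a __mmask8 that the masked arithmetic helpers consume directly. Assuming a clang build with -mavx512vl (and -mavx512f):

    #include <immintrin.h>

    /* Add b into a only in lanes where a > b (signed 32-bit); other lanes are zeroed. */
    static __m256i add_where_greater(__m256i a, __m256i b)
    {
            __mmask8 k = _mm256_cmpgt_epi32_mask(a, b);
            return _mm256_maskz_add_epi32(k, a, b);
    }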
-_mm_maskz_and_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_and_epi32(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_andnot_epi32(__m256i __A, __m256i __B) -{ - return (__m256i)(~(__v8su)__A & (__v8su)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_andnot_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_andnot_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_andnot_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_andnot_epi32(_mm256_setzero_si256(), - __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_andnot_epi32(__m128i __A, __m128i __B) -{ - return (__m128i)(~(__v4su)__A & (__v4su)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_andnot_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_andnot_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_andnot_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_andnot_epi32(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_or_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a | (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_or_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_or_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_or_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_or_epi32(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_or_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4su)__a | (__v4su)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_or_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_or_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_or_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_or_epi32(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_xor_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)((__v8su)__a ^ (__v8su)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_xor_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_xor_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_xor_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_xor_epi32(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_xor_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4su)__a ^ (__v4su)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_xor_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_xor_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
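Another editorial sketch (not from the patch) contrasting the two masking flavors of the bitwise helpers above: the _mm_mask_* form merges with a source vector, while _mm_maskz_* zeroes unselected lanes. Assuming -mavx512vl under clang:

    #include <immintrin.h>

    static void and_masking(__m128i src, __m128i a, __m128i b, __mmask8 k,
                            __m128i *merged, __m128i *zeroed)
    {
            *merged = _mm_mask_and_epi32(src, k, a, b); /* keeps src where k bit is 0 */
            *zeroed = _mm_maskz_and_epi32(k, a, b);     /* zero where k bit is 0     */
    }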
-_mm_maskz_xor_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_xor_epi32(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_and_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a & (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_and_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_and_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_and_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_and_epi64(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_and_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a & (__v2du)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_and_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_and_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_and_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_and_epi64(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_andnot_epi64(__m256i __A, __m256i __B) -{ - return (__m256i)(~(__v4du)__A & (__v4du)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_andnot_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_andnot_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_andnot_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_andnot_epi64(_mm256_setzero_si256(), - __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_andnot_epi64(__m128i __A, __m128i __B) -{ - return (__m128i)(~(__v2du)__A & (__v2du)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_andnot_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_andnot_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_andnot_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_andnot_epi64(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_or_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a | (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_or_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_or_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_or_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_or_epi64(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_or_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a | (__v2du)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_or_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_or_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
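A short editorial sketch for the 64-bit andnot helper above, clearing a flag pattern only in selected lanes; the function name is hypothetical. Assuming -mavx512vl:

    #include <immintrin.h>

    /* _mm256_mask_andnot_epi64(W, U, A, B) yields (~A & B) where U is set, else W. */
    static __m256i clear_flags_in_lanes(__m256i v, __m256i flags, __mmask8 lanes)
    {
            return _mm256_mask_andnot_epi64(v, lanes, flags, v);
    }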
-_mm_maskz_or_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_or_epi64(_mm_setzero_si128(), __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_xor_epi64(__m256i __a, __m256i __b) -{ - return (__m256i)((__v4du)__a ^ (__v4du)__b); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_xor_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_xor_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_xor_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)_mm256_mask_xor_epi64(_mm256_setzero_si256(), __U, __A, __B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_xor_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a ^ (__v2du)__b); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_xor_epi64(__m128i __W, __mmask8 __U, __m128i __A, - __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_xor_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_xor_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)_mm_mask_xor_epi64(_mm_setzero_si128(), __U, __A, __B); -} - -#define _mm_cmp_epi32_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epi32_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_epu32_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epu32_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpd128_mask((__v4si)(__m128i)(a), \ - (__v4si)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_epi32_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_epi32_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_epu32_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_epu32_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpd256_mask((__v8si)(__m256i)(a), \ - (__v8si)(__m256i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_epi64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epi64_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_epu64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_epu64_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq128_mask((__v2di)(__m128i)(a), \ - (__v2di)(__m128i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_epi64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_epi64_mask(m, a, b, p) \ - 
((__mmask8)__builtin_ia32_cmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_epu64_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_epu64_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_ucmpq256_mask((__v4di)(__m256i)(a), \ - (__v4di)(__m256i)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_ps_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_ps_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpps256_mask((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm256_cmp_pd_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm256_mask_cmp_pd_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmppd256_mask((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_ps_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_ps_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmpps128_mask((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (int)(p), \ - (__mmask8)(m))) - -#define _mm_cmp_pd_mask(a, b, p) \ - ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)-1)) - -#define _mm_mask_cmp_pd_mask(m, a, b, p) \ - ((__mmask8)__builtin_ia32_cmppd128_mask((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (int)(p), \ - (__mmask8)(m))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 
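The generic compare macros above take an explicit predicate: _MM_CMPINT_* values for the integer forms, _CMP_* values for the ps/pd forms; the masked variants additionally AND the result with an existing mask. A hedged editorial sketch, assuming -mavx512vl:

    #include <immintrin.h>

    /* Mask of lanes of x inside [lo, hi], computed as two chained compares. */
    static __mmask8 in_range(__m256 x, __m256 lo, __m256 hi)
    {
            __mmask8 ge = _mm256_cmp_ps_mask(x, lo, _CMP_GE_OQ);
            return _mm256_mask_cmp_ps_mask(ge, x, hi, _CMP_LE_OQ);
    }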
-_mm_maskz_fnmadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd (-(__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 (-(__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 
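The masked FMA helpers above come in three flavors: _mm*_mask_* keeps the first operand in unselected lanes, _mm*_mask3_* keeps the addend, and _mm*_maskz_* zeroes them. A small editorial sketch, assuming -mavx512vl and FMA support:

    #include <immintrin.h>

    /* acc += a*x, but only in lanes selected by k; other lanes keep acc. */
    static __m256d fma_into_acc(__m256d acc, __mmask8 k, __m256d a, __m256d x)
    {
            return _mm256_mask3_fmadd_pd(a, x, acc, k);
    }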
-_mm_maskz_fmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fnmsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps (-(__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - 
return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fnmsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 (-(__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmaddsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmaddsub_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - (__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fmsubadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsubadd_pd(__mmask8 __U, __m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fmaddsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmaddsub_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - (__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsubadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsubadd_pd(__mmask8 __U, __m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmaddsub_ps(__m128 __A, __mmask8 
__U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fmaddsub_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fmsubadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_fmsubadd_ps(__mmask8 __U, __m128 __A, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fmaddsub_ps(__m256 __A, __mmask8 __U, __m256 __B, - __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmaddsub_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fmsubadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_fmsubadd_ps(__mmask8 __U, __m256 __A, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return 
(__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddsubpd ((__v2df) __A, - (__v2df) __B, - -(__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddsubpd256 ((__v4df) __A, - (__v4df) __B, - -(__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddsubps ((__v4sf) __A, - (__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddsubps256 ((__v8sf) __A, - (__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - (__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmadd_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - -(__v4df) __B, - (__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fnmadd_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - (__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmadd_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - (__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_pd(__m128d __A, __mmask8 __U, __m128d __B, __m128d __C) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - -(__v2df) __C), - (__v2df) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C, __mmask8 __U) -{ - return (__m128d) __builtin_ia32_selectpd_128((__mmask8) __U, - __builtin_ia32_vfmaddpd ((__v2df) __A, - -(__v2df) __B, - -(__v2df) __C), - (__v2df) __C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmsub_pd(__m256d __A, __mmask8 __U, __m256d __B, __m256d __C) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - 
-(__v4df) __B, - -(__v4df) __C), - (__v4df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C, __mmask8 __U) -{ - return (__m256d) __builtin_ia32_selectpd_256((__mmask8) __U, - __builtin_ia32_vfmaddpd256 ((__v4df) __A, - -(__v4df) __B, - -(__v4df) __C), - (__v4df) __C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_fnmsub_ps(__m128 __A, __mmask8 __U, __m128 __B, __m128 __C) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask3_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C, __mmask8 __U) -{ - return (__m128) __builtin_ia32_selectps_128((__mmask8) __U, - __builtin_ia32_vfmaddps ((__v4sf) __A, - -(__v4sf) __B, - -(__v4sf) __C), - (__v4sf) __C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_fnmsub_ps(__m256 __A, __mmask8 __U, __m256 __B, __m256 __C) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask3_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C, __mmask8 __U) -{ - return (__m256) __builtin_ia32_selectps_256((__mmask8) __U, - __builtin_ia32_vfmaddps256 ((__v8sf) __A, - -(__v8sf) __B, - -(__v8sf) __C), - (__v8sf) __C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_add_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_add_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_add_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_add_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_add_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_add_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_add_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_add_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_add_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_add_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_add_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_add_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_add_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_add_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_add_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi32 (__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) 
__U, - (__v4si) __W, - (__v4si) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi32 (__mmask8 __U, __m256i __A, __m256i __W) { - return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, - (__v8si) __W, - (__v8si) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_blend_pd (__mmask8 __U, __m128d __A, __m128d __W) { - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __W, - (__v2df) __A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_pd (__mmask8 __U, __m256d __A, __m256d __W) { - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __W, - (__v4df) __A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_blend_ps (__mmask8 __U, __m128 __A, __m128 __W) { - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __W, - (__v4sf) __A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_ps (__mmask8 __U, __m256 __A, __m256 __W) { - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __W, - (__v8sf) __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_blend_epi64 (__mmask8 __U, __m128i __A, __m128i __W) { - return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, - (__v2di) __W, - (__v2di) __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_blend_epi64 (__mmask8 __U, __m256i __A, __m256i __W) { - return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, - (__v4di) __W, - (__v4di) __A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_compress_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_pd (__mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_compressdf128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_compressdf256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_compress_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_compressdi128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_compressdi256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_compress_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, - 
(__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_compresssf128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_compresssf256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_compress_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_compresssi128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_compresssi256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m128d __A) { - __builtin_ia32_compressstoredf128_mask ((__v2df *) __P, - (__v2df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_pd (void *__P, __mmask8 __U, __m256d __A) { - __builtin_ia32_compressstoredf256_mask ((__v4df *) __P, - (__v4df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m128i __A) { - __builtin_ia32_compressstoredi128_mask ((__v2di *) __P, - (__v2di) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_epi64 (void *__P, __mmask8 __U, __m256i __A) { - __builtin_ia32_compressstoredi256_mask ((__v4di *) __P, - (__v4di) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m128 __A) { - __builtin_ia32_compressstoresf128_mask ((__v4sf *) __P, - (__v4sf) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_ps (void *__P, __mmask8 __U, __m256 __A) { - __builtin_ia32_compressstoresf256_mask ((__v8sf *) __P, - (__v8sf) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m128i __A) { - __builtin_ia32_compressstoresi128_mask ((__v4si *) __P, - (__v4si) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_epi32 (void *__P, __mmask8 __U, __m256i __A) { - __builtin_ia32_compressstoresi256_mask ((__v8si *) __P, - (__v8si) __A, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, - 
(__v2df)_mm_cvtepi32_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, - (__v2df)_mm_cvtepi32_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_pd (__m256d __W, __mmask8 __U, __m128i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, - (__v4df)_mm256_cvtepi32_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi32_pd (__mmask8 __U, __m128i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, - (__v4df)_mm256_cvtepi32_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_cvtepi32_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi32_ps (__mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_cvtepi32_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_cvtepi32_ps(__A), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi32_ps (__mmask8 __U, __m256i __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_cvtepi32_ps(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_epi32 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2dq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm256_cvtpd_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm256_cvtpd_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m128d __A) { - return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_ps (__mmask8 __U, __m128d __A) { - return (__m128) __builtin_ia32_cvtpd2ps_mask ((__v2df) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_ps (__m128 __W, __mmask8 __U, __m256d __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtpd_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_ps (__mmask8 __U, __m256d __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm256_cvtpd_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtpd_epu32 (__m128d __A) { - 
return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtpd_epu32 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvtpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtpd_epu32 (__m256d __A) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtpd_epu32 (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvtpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtps_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_epi32 (__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtps_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtps_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_epi32 (__mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtps_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_pd (__m128d __W, __mmask8 __U, __m128 __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtps_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_cvtps_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_pd (__m256d __W, __mmask8 __U, __m128 __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtps_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_pd (__mmask8 __U, __m128 __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_cvtps_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtps_epu32 (__m128 __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
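The compress/compressstoreu helpers above pack selected lanes contiguously, which gives a simple stream-compaction idiom. An editorial sketch (not from the patch), assuming -mavx512vl; dst is a hypothetical buffer with room for 8 floats:

    #include <immintrin.h>

    /* Store only the positive elements of v, packed at dst; return how many were kept. */
    static int keep_positive(float *dst, __m256 v)
    {
            __mmask8 k = _mm256_cmp_ps_mask(v, _mm256_setzero_ps(), _CMP_GT_OQ);
            _mm256_mask_compressstoreu_ps(dst, k, v);
            return __builtin_popcount((unsigned)k);
    }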
-_mm_maskz_cvtps_epu32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvtps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvtps_epu32 (__m256 __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtps_epu32 (__mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvtps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttpd_epi32 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2dq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttpd_epi32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm256_cvttpd_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttpd_epi32 (__mmask8 __U, __m256d __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm256_cvttpd_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttpd_epu32 (__m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttpd_epu32 (__mmask8 __U, __m128d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq128_mask ((__v2df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvttpd_epu32 (__m256d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttpd_epu32 (__m128i __W, __mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttpd_epu32 (__mmask8 __U, __m256d __A) { - return (__m128i) __builtin_ia32_cvttpd2udq256_mask ((__v4df) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttps_epi32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvttps_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttps_epi32 (__mmask8 __U, __m128 __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvttps_epi32(__A), - 
(__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttps_epi32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvttps_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttps_epi32 (__mmask8 __U, __m256 __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvttps_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvttps_epu32 (__m128 __A) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvttps_epu32 (__m128i __W, __mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvttps_epu32 (__mmask8 __U, __m128 __A) { - return (__m128i) __builtin_ia32_cvttps2udq128_mask ((__v4sf) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cvttps_epu32 (__m256 __A) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) -1); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvttps_epu32 (__m256i __W, __mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvttps_epu32 (__mmask8 __U, __m256 __A) { - return (__m256i) __builtin_ia32_cvttps2udq256_mask ((__v8sf) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_cvtepu32_pd (__m128i __A) { - return (__m128d) __builtin_convertvector( - __builtin_shufflevector((__v4su)__A, (__v4su)__A, 0, 1), __v2df); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu32_pd (__m128d __W, __mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, - (__v2df)_mm_cvtepu32_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8) __U, - (__v2df)_mm_cvtepu32_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_cvtepu32_pd (__m128i __A) { - return (__m256d)__builtin_convertvector((__v4su)__A, __v4df); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu32_pd (__m256d __W, __mmask8 __U, __m128i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, - (__v4df)_mm256_cvtepu32_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu32_pd (__mmask8 __U, __m128i __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8) __U, - (__v4df)_mm256_cvtepu32_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtepu32_ps (__m128i __A) { - return (__m128)__builtin_convertvector((__v4su)__A, __v4sf); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepu32_ps (__m128 __W, __mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_cvtepu32_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 
-_mm_maskz_cvtepu32_ps (__mmask8 __U, __m128i __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_cvtepu32_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_cvtepu32_ps (__m256i __A) { - return (__m256)__builtin_convertvector((__v8su)__A, __v8sf); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepu32_ps (__m256 __W, __mmask8 __U, __m256i __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_cvtepu32_ps(__A), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepu32_ps (__mmask8 __U, __m256i __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_cvtepu32_ps(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_div_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_div_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_div_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_div_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_div_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_div_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_div_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_div_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_div_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_div_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_div_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_div_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_div_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_div_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_div_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_div_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_expand_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_pd (__mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_expanddf128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_expanddf256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd 
(), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expand_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expanddi128_mask ((__v2di) __A, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_expanddi256_mask ((__v4di) __A, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_pd (__m128d __W, __mmask8 __U, void const *__P) { - return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P, - (__v2df) __W, - (__mmask8) - __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { - return (__m128d) __builtin_ia32_expandloaddf128_mask ((const __v2df *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) - __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_pd (__m256d __W, __mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P, - (__v4df) __W, - (__mmask8) - __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_pd (__mmask8 __U, void const *__P) { - return (__m256d) __builtin_ia32_expandloaddf256_mask ((const __v4df *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P, - (__v2di) __W, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_expandloaddi128_mask ((const __v2di *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_epi64 (__m256i __W, __mmask8 __U, - void const *__P) { - return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P, - (__v4di) __W, - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_epi64 (__mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_expandloaddi256_mask ((const __v4di *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) - __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_ps (__m128 __W, __mmask8 __U, void const *__P) { - return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { - return (__m128) __builtin_ia32_expandloadsf128_mask ((const __v4sf *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) - __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_ps (__m256 __W, __mmask8 __U, void const *__P) { - return (__m256) 
__builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_ps (__mmask8 __U, void const *__P) { - return (__m256) __builtin_ia32_expandloadsf256_mask ((const __v8sf *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P, - (__v4si) __W, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { - return (__m128i) __builtin_ia32_expandloadsi128_mask ((const __v4si *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_epi32 (__m256i __W, __mmask8 __U, - void const *__P) { - return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P, - (__v8si) __W, - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_epi32 (__mmask8 __U, void const *__P) { - return (__m256i) __builtin_ia32_expandloadsi256_mask ((const __v8si *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) - __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_expand_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_expandsf128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_expandsf256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expand_epi32 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_epi32 (__mmask8 __U, __m128i __A) { - return (__m128i) __builtin_ia32_expandsi128_mask ((__v4si) __A, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_epi32 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, - (__v8si) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_epi32 (__mmask8 __U, __m256i __A) { - return (__m256i) __builtin_ia32_expandsi256_mask ((__v8si) __A, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_getexp_pd (__m128d __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_pd (__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - 
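/*
 * Editor's note, not part of the original patch: a minimal usage sketch of
 * the merge-/zero-masking pattern the helpers above implement, assuming a
 * clang build with -mavx512f -mavx512vl. For inactive mask bits, the
 * _mm_mask_* form keeps the corresponding lane of the pass-through operand
 * __W, while the _mm_maskz_* form zeroes it.
 */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d a = _mm_set_pd(8.0, 2.0);   /* lanes (low, high): {2.0, 8.0} */
    __m128d b = _mm_set_pd(2.0, 1.0);   /* lanes (low, high): {1.0, 2.0} */
    __m128d w = _mm_set_pd(-1.0, -1.0); /* pass-through for inactive lanes */
    __mmask8 k = 0x1;                   /* only lane 0 is active */

    __m128d merge = _mm_mask_div_pd(w, k, a, b);  /* {2.0, -1.0} */
    __m128d zero  = _mm_maskz_div_pd(k, a, b);    /* {2.0,  0.0} */

    double m[2], z[2];
    _mm_storeu_pd(m, merge);
    _mm_storeu_pd(z, zero);
    printf("merge: %g %g   zero: %g %g\n", m[0], m[1], z[0], z[1]);
    return 0;
}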
-static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_pd (__mmask8 __U, __m128d __A) { - return (__m128d) __builtin_ia32_getexppd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_getexp_pd (__m256d __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_getexp_pd (__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_getexp_pd (__mmask8 __U, __m256d __A) { - return (__m256d) __builtin_ia32_getexppd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_getexp_ps (__m128 __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_getexp_ps (__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_getexp_ps (__mmask8 __U, __m128 __A) { - return (__m128) __builtin_ia32_getexpps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_getexp_ps (__m256 __A) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_getexp_ps (__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_getexp_ps (__mmask8 __U, __m256 __A) { - return (__m256) __builtin_ia32_getexpps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_max_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_max_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_max_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_max_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_max_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_max_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_max_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_max_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_max_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_max_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_max_ps(__A, __B), - 
(__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_max_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_max_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_max_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_min_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_min_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_min_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_min_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_min_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_min_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_min_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_min_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_min_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_min_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_min_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_min_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_min_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_min_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mul_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_mul_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_mul_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_mul_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_mul_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mul_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - 
(__v4sf)_mm_mul_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mul_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_mul_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_mul_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_mul_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_mul_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_mul_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_abs_epi32(__A), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi32(__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_abs_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_abs_epi32(__A), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi32(__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_abs_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_abs_epi64 (__m128i __A) { -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pabsq128((__v2di)__A); -#else - return (__m128i)__builtin_elementwise_abs((__v2di)__A); -#endif -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_abs_epi64 (__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_abs_epi64(__A), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_abs_epi64 (__mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_abs_epi64(__A), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_abs_epi64 (__m256i __A) { -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pabsq256 ((__v4di)__A); -#else - return (__m256i)__builtin_elementwise_abs((__v4di)__A); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_abs_epi64 (__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_abs_epi64(__A), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_abs_epi64 (__mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_abs_epi64(__A), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_max_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_max_epi32(__A, __B), - (__v4si)__W); 
-} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_max_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_max_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_max_epi64 (__m128i __A, __m128i __B) { -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pmaxsq128((__v2di)__A, (__v2di)__B); -#else - return (__m128i)__builtin_elementwise_max((__v2di)__A, (__v2di)__B); -#endif -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_max_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_max_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epi64 (__m256i __A, __m256i __B) { -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxsq256((__v4di)__A, (__v4di)__B); -#else - return (__m256i)__builtin_elementwise_max((__v4di)__A, (__v4di)__B); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_max_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_max_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epu32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_max_epu32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_max_epu32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epu32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_max_epu32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_max_epu32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_max_epu64 (__m128i __A, __m128i __B) { -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pmaxuq128((__v2di)__A, (__v2di)__B); -#else - return (__m128i)__builtin_elementwise_max((__v2du)__A, (__v2du)__B); -#endif -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_max_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_max_epu64(__A, __B), - 
(__v2di)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_max_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_max_epu64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_max_epu64 (__m256i __A, __m256i __B) { -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pmaxuq256((__v4di)__A, (__v4di)__B); -#else - return (__m256i)__builtin_elementwise_max((__v4du)__A, (__v4du)__B); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_max_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_max_epu64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_max_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_max_epu64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epi32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_min_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epi32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_min_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epi32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_min_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epi32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_min_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_min_epi64 (__m128i __A, __m128i __B) { -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pminsq128((__v2di)__A, (__v2di)__B); -#else - return (__m128i)__builtin_elementwise_min((__v2di)__A, (__v2di)__B); -#endif -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epi64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epi64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epi64 (__m256i __A, __m256i __B) { -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminsq256((__v4di)__A, (__v4di)__B); -#else - return (__m256i)__builtin_elementwise_min((__v4di)__A, (__v4di)__B); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epi64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epi64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epi64(__A, 
__B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epu32(__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_min_epu32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epu32(__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm_min_epu32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epu32(__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_min_epu32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epu32(__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_min_epu32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_min_epu64 (__m128i __A, __m128i __B) { -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pminuq128((__v2di)__A, (__v2di)__B); -#else - return (__m128i)__builtin_elementwise_min((__v2du)__A, (__v2du)__B); -#endif -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_min_epu64 (__m128i __W, __mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epu64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_min_epu64 (__mmask8 __M, __m128i __A, __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__M, - (__v2di)_mm_min_epu64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_min_epu64 (__m256i __A, __m256i __B) { -#if (__clang_major__ < 14) - return (__m256i)__builtin_ia32_pminuq256((__v4di)__A, (__v4di)__B); -#else - return (__m256i)__builtin_elementwise_min((__v4du)__A, (__v4du)__B); -#endif -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_min_epu64 (__m256i __W, __mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epu64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_min_epu64 (__mmask8 __M, __m256i __A, __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_min_epu64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -#define _mm_roundscale_pd(A, imm) \ - ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1)) - - -#define _mm_mask_roundscale_pd(W, U, A, imm) \ - ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - - -#define _mm_maskz_roundscale_pd(U, A, imm) \ - ((__m128d)__builtin_ia32_rndscalepd_128_mask((__v2df)(__m128d)(A), \ - (int)(imm), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - - -#define _mm256_roundscale_pd(A, imm) \ - ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1)) - - -#define _mm256_mask_roundscale_pd(W, U, A, imm) \ - ((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U))) - - -#define _mm256_maskz_roundscale_pd(U, A, imm) \ - 
((__m256d)__builtin_ia32_rndscalepd_256_mask((__v4df)(__m256d)(A), \ - (int)(imm), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_roundscale_ps(A, imm) \ - ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1)) - - -#define _mm_mask_roundscale_ps(W, U, A, imm) \ - ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U))) - - -#define _mm_maskz_roundscale_ps(U, A, imm) \ - ((__m128)__builtin_ia32_rndscaleps_128_mask((__v4sf)(__m128)(A), (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_roundscale_ps(A, imm) \ - ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_roundscale_ps(W, U, A, imm) \ - ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U))) - - -#define _mm256_maskz_roundscale_ps(U, A, imm) \ - ((__m256)__builtin_ia32_rndscaleps_256_mask((__v8sf)(__m256)(A), (int)(imm), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_scalef_pd (__m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_pd (__m128d __W, __mmask8 __U, __m128d __A, - __m128d __B) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_pd (__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d) __builtin_ia32_scalefpd128_mask ((__v2df) __A, - (__v2df) __B, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_scalef_pd (__m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_scalef_pd (__m256d __W, __mmask8 __U, __m256d __A, - __m256d __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_scalef_pd (__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d) __builtin_ia32_scalefpd256_mask ((__v4df) __A, - (__v4df) __B, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_scalef_ps (__m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_scalef_ps (__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_scalef_ps (__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128) __builtin_ia32_scalefps128_mask ((__v4sf) __A, - (__v4sf) __B, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_scalef_ps (__m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) 
__B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_scalef_ps (__m256 __W, __mmask8 __U, __m256 __A, - __m256 __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_scalef_ps (__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256) __builtin_ia32_scalefps256_mask ((__v8sf) __A, - (__v8sf) __B, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -#define _mm_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)-1, \ - (__v2di)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)) - -#define _mm_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv2df((void *)(addr), (__mmask8)(mask), \ - (__v2di)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)) - -#define _mm_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)-1, \ - (__v2di)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)) - -#define _mm_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv2di((void *)(addr), (__mmask8)(mask), \ - (__v2di)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)) - -#define _mm256_i64scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)-1, \ - (__v4di)(__m256i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)) - -#define _mm256_mask_i64scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4df((void *)(addr), (__mmask8)(mask), \ - (__v4di)(__m256i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)) - -#define _mm256_i64scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)-1, \ - (__v4di)(__m256i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)) - -#define _mm256_mask_i64scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4di((void *)(addr), (__mmask8)(mask), \ - (__v4di)(__m256i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)) - -#define _mm_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)-1, \ - (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4sf((void *)(addr), (__mmask8)(mask), \ - (__v2di)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)-1, \ - (__v2di)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm_mask_i64scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv4si((void *)(addr), (__mmask8)(mask), \ - (__v2di)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm256_i64scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)-1, \ - (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm256_mask_i64scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8sf((void *)(addr), (__mmask8)(mask), \ - (__v4di)(__m256i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm256_i64scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)-1, \ - (__v4di)(__m256i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm256_mask_i64scatter_epi32(addr, 
mask, index, v1, scale) \ - __builtin_ia32_scatterdiv8si((void *)(addr), (__mmask8)(mask), \ - (__v4di)(__m256i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)) - -#define _mm_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv2df((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v2df)(__m128d)(v1), (int)(scale)) - -#define _mm_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)) - -#define _mm_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv2di((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v2di)(__m128i)(v1), (int)(scale)) - -#define _mm256_i32scatter_pd(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)) - -#define _mm256_mask_i32scatter_pd(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4df((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4df)(__m256d)(v1), (int)(scale)) - -#define _mm256_i32scatter_epi64(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)) - -#define _mm256_mask_i32scatter_epi64(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4di((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4di)(__m256i)(v1), (int)(scale)) - -#define _mm_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4sf((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), (__v4sf)(__m128)(v1), \ - (int)(scale)) - -#define _mm_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)-1, \ - (__v4si)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv4si((void *)(addr), (__mmask8)(mask), \ - (__v4si)(__m128i)(index), \ - (__v4si)(__m128i)(v1), (int)(scale)) - -#define _mm256_i32scatter_ps(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ - (int)(scale)) - -#define _mm256_mask_i32scatter_ps(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8sf((void *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), (__v8sf)(__m256)(v1), \ - (int)(scale)) - -#define _mm256_i32scatter_epi32(addr, index, v1, scale) \ - __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)-1, \ - (__v8si)(__m256i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)) - -#define _mm256_mask_i32scatter_epi32(addr, mask, index, v1, scale) \ - __builtin_ia32_scattersiv8si((void *)(addr), (__mmask8)(mask), \ - (__v8si)(__m256i)(index), \ - (__v8si)(__m256i)(v1), (int)(scale)) - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_sqrt_pd(__m128d __W, __mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)__W); - } - - static 
__inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_sqrt_pd(__mmask8 __U, __m128d __A) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sqrt_pd(__A), - (__v2df)_mm_setzero_pd()); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_mask_sqrt_pd(__m256d __W, __mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)__W); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_maskz_sqrt_pd(__mmask8 __U, __m256d __A) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sqrt_pd(__A), - (__v4df)_mm256_setzero_pd()); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_mask_sqrt_ps(__m128 __W, __mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)__W); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_maskz_sqrt_ps(__mmask8 __U, __m128 __A) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sqrt_ps(__A), - (__v4sf)_mm_setzero_ps()); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_sqrt_ps(__m256 __W, __mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)__W); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_maskz_sqrt_ps(__mmask8 __U, __m256 __A) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sqrt_ps(__A), - (__v8sf)_mm256_setzero_ps()); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_sub_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)__W); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_sub_pd(__mmask8 __U, __m128d __A, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_sub_pd(__A, __B), - (__v2df)_mm_setzero_pd()); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_mask_sub_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)__W); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_maskz_sub_pd(__mmask8 __U, __m256d __A, __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_sub_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_mask_sub_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)__W); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_maskz_sub_ps(__mmask8 __U, __m128 __A, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_sub_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_sub_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)__W); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_maskz_sub_ps(__mmask8 __U, __m256 __A, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_sub_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - 
_mm_permutex2var_epi32(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2vard128((__v4si) __A, (__v4si)__I, - (__v4si)__B); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_epi32(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)__A); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_epi32(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)__I); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_epi32(__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_permutex2var_epi32(__A, __I, __B), - (__v4si)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_permutex2var_epi32(__m256i __A, __m256i __I, __m256i __B) { - return (__m256i)__builtin_ia32_vpermi2vard256((__v8si)__A, (__v8si) __I, - (__v8si) __B); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_epi32(__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)__A); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask2_permutex2var_epi32(__m256i __A, __m256i __I, __mmask8 __U, - __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)__I); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_permutex2var_epi32(__mmask8 __U, __m256i __A, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_permutex2var_epi32(__A, __I, __B), - (__v8si)_mm256_setzero_si256()); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_permutex2var_pd(__m128d __A, __m128i __I, __m128d __B) { - return (__m128d)__builtin_ia32_vpermi2varpd128((__v2df)__A, (__v2di)__I, - (__v2df)__B); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_pd(__m128d __A, __mmask8 __U, __m128i __I, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)__A); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_pd(__m128d __A, __m128i __I, __mmask8 __U, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)(__m128d)__I); - } - - static __inline__ __m128d __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_pd(__mmask8 __U, __m128d __A, __m128i __I, __m128d __B) { - return (__m128d)__builtin_ia32_selectpd_128(__U, - (__v2df)_mm_permutex2var_pd(__A, __I, __B), - (__v2df)_mm_setzero_pd()); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_permutex2var_pd(__m256d __A, __m256i __I, __m256d __B) { - return (__m256d)__builtin_ia32_vpermi2varpd256((__v4df)__A, (__v4di)__I, - (__v4df)__B); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_pd(__m256d __A, __mmask8 __U, __m256i __I, - __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)__A); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_mask2_permutex2var_pd(__m256d __A, __m256i __I, 
__mmask8 __U, - __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)(__m256d)__I); - } - - static __inline__ __m256d __DEFAULT_FN_ATTRS256 - _mm256_maskz_permutex2var_pd(__mmask8 __U, __m256d __A, __m256i __I, - __m256d __B) { - return (__m256d)__builtin_ia32_selectpd_256(__U, - (__v4df)_mm256_permutex2var_pd(__A, __I, __B), - (__v4df)_mm256_setzero_pd()); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_permutex2var_ps(__m128 __A, __m128i __I, __m128 __B) { - return (__m128)__builtin_ia32_vpermi2varps128((__v4sf)__A, (__v4si)__I, - (__v4sf)__B); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_ps(__m128 __A, __mmask8 __U, __m128i __I, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)__A); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_ps(__m128 __A, __m128i __I, __mmask8 __U, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)(__m128)__I); - } - - static __inline__ __m128 __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_ps(__mmask8 __U, __m128 __A, __m128i __I, __m128 __B) { - return (__m128)__builtin_ia32_selectps_128(__U, - (__v4sf)_mm_permutex2var_ps(__A, __I, __B), - (__v4sf)_mm_setzero_ps()); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_permutex2var_ps(__m256 __A, __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_vpermi2varps256((__v8sf)__A, (__v8si)__I, - (__v8sf) __B); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_ps(__m256 __A, __mmask8 __U, __m256i __I, __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)__A); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_mask2_permutex2var_ps(__m256 __A, __m256i __I, __mmask8 __U, - __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)(__m256)__I); - } - - static __inline__ __m256 __DEFAULT_FN_ATTRS256 - _mm256_maskz_permutex2var_ps(__mmask8 __U, __m256 __A, __m256i __I, - __m256 __B) { - return (__m256)__builtin_ia32_selectps_256(__U, - (__v8sf)_mm256_permutex2var_ps(__A, __I, __B), - (__v8sf)_mm256_setzero_ps()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_permutex2var_epi64(__m128i __A, __m128i __I, __m128i __B) { - return (__m128i)__builtin_ia32_vpermi2varq128((__v2di)__A, (__v2di)__I, - (__v2di)__B); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_permutex2var_epi64(__m128i __A, __mmask8 __U, __m128i __I, - __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)__A); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask2_permutex2var_epi64(__m128i __A, __m128i __I, __mmask8 __U, - __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)__I); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_permutex2var_epi64(__mmask8 __U, __m128i __A, __m128i __I, - __m128i __B) { - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_permutex2var_epi64(__A, __I, __B), - (__v2di)_mm_setzero_si128()); - } - - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_permutex2var_epi64(__m256i __A, __m256i __I, __m256i __B) { - return 
(__m256i)__builtin_ia32_vpermi2varq256((__v4di)__A, (__v4di) __I, - (__v4di) __B); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_permutex2var_epi64(__m256i __A, __mmask8 __U, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)__A); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask2_permutex2var_epi64(__m256i __A, __m256i __I, __mmask8 __U, - __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)__I); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_permutex2var_epi64(__mmask8 __U, __m256i __A, __m256i __I, - __m256i __B) { - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_permutex2var_epi64(__A, __I, __B), - (__v4di)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepi8_epi32(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepi8_epi32(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi8_epi32(__A), - (__v4si)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepi8_epi32 (__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepi8_epi32 (__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi8_epi32(__A), - (__v8si)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepi8_epi64(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi8_epi64(__A), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepi8_epi64(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepi8_epi64(__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi8_epi64(__A), - (__v4di)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepi32_epi64(__m128i __W, __mmask8 __U, __m128i __X) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi32_epi64(__X), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepi32_epi64(__m256i __W, __mmask8 __U, __m128i __X) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)__W); - } - - 
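
As a quick illustration of the two-source permutes and masked widening conversions defined above, here is a minimal, self-contained usage sketch. It is not part of the upstream header: it assumes a clang toolchain invoked with -mavx512f -mavx512vl and an AVX-512VL capable CPU, and the buffer names, masks and printed layout are illustrative only.

/* Usage sketch (hypothetical, not part of the patch): VPERMI2D two-source
 * shuffle plus a zero-masked sign extension.
 * Build assumption: clang -O2 -mavx512f -mavx512vl example.c */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a   = _mm_setr_epi32(10, 11, 12, 13);    /* source 0 */
    __m128i b   = _mm_setr_epi32(20, 21, 22, 23);    /* source 1 */
    /* Indices 0..3 select lanes of 'a', indices 4..7 select lanes of 'b'. */
    __m128i idx = _mm_setr_epi32(0, 4, 1, 5);
    __m128i mix = _mm_permutex2var_epi32(a, idx, b); /* {10, 20, 11, 21} */

    /* Zero-masked widening: lanes whose mask bit is 0 become 0. */
    __m128i bytes = _mm_setr_epi8(-1, 2, -3, 4, 0, 0, 0, 0,
                                   0, 0, 0, 0, 0, 0, 0, 0);
    __m128i wide  = _mm_maskz_cvtepi8_epi32(0x5 /* lanes 0 and 2 */, bytes);
    /* wide == {-1, 0, -3, 0} */

    int out[4];
    _mm_storeu_si128((__m128i *)out, mix);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    _mm_storeu_si128((__m128i *)out, wide);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
    return 0;
}

The index vector selects across the concatenation of both sources: for the 128-bit epi32 variant, index values 0-3 pick lanes of the first operand and 4-7 pick lanes of the second, and the mask_/mask2_/maskz_ forms above only differ in which operand supplies the unselected lanes.
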
static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepi32_epi64(__mmask8 __U, __m128i __X) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi32_epi64(__X), - (__v4di)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepi16_epi32(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepi16_epi32(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepi16_epi32(__A), - (__v4si)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepi16_epi32(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepi16_epi32 (__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepi16_epi32(__A), - (__v8si)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepi16_epi64(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepi16_epi64(__A), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepi16_epi64(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepi16_epi64(__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepi16_epi64(__A), - (__v4di)_mm256_setzero_si256()); - } - - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepu8_epi32(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu8_epi32(__A), - (__v4si)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepu8_epi32(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepu8_epi32(__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu8_epi32(__A), - (__v8si)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepu8_epi64(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepu8_epi64(__mmask8 __U, __m128i __A) - { - return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu8_epi64(__A), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepu8_epi64(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepu8_epi64 (__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu8_epi64(__A), - (__v4di)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepu32_epi64(__m128i __W, __mmask8 __U, __m128i __X) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu32_epi64(__X), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepu32_epi64(__m256i __W, __mmask8 __U, __m128i __X) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepu32_epi64(__mmask8 __U, __m128i __X) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu32_epi64(__X), - (__v4di)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepu16_epi32(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_cvtepu16_epi32(__A), - (__v4si)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepu16_epi32(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepu16_epi32(__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_cvtepu16_epi32(__A), - (__v8si)_mm256_setzero_si256()); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_mask_cvtepu16_epi64(__m128i __W, __mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)__W); - } - - static __inline__ __m128i __DEFAULT_FN_ATTRS128 - _mm_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) - { - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_cvtepu16_epi64(__A), - (__v2di)_mm_setzero_si128()); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_mask_cvtepu16_epi64(__m256i __W, __mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)__W); - } - - static __inline__ __m256i __DEFAULT_FN_ATTRS256 - _mm256_maskz_cvtepu16_epi64(__mmask8 __U, __m128i __A) - { - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_cvtepu16_epi64(__A), - (__v4di)_mm256_setzero_si256()); - } - - -#define _mm_rol_epi32(a, 
b) \ - ((__m128i)__builtin_ia32_prold128((__v4si)(__m128i)(a), (int)(b))) - -#define _mm_mask_rol_epi32(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)(__m128i)(w))) - -#define _mm_maskz_rol_epi32(u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_rol_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_rol_epi32(a, b) \ - ((__m256i)__builtin_ia32_prold256((__v8si)(__m256i)(a), (int)(b))) - -#define _mm256_mask_rol_epi32(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)(__m256i)(w))) - -#define _mm256_maskz_rol_epi32(u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_rol_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_rol_epi64(a, b) \ - ((__m128i)__builtin_ia32_prolq128((__v2di)(__m128i)(a), (int)(b))) - -#define _mm_mask_rol_epi64(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)(__m128i)(w))) - -#define _mm_maskz_rol_epi64(u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_rol_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_rol_epi64(a, b) \ - ((__m256i)__builtin_ia32_prolq256((__v4di)(__m256i)(a), (int)(b))) - -#define _mm256_mask_rol_epi64(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)(__m256i)(w))) - -#define _mm256_maskz_rol_epi64(u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_rol_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256())) - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_rolv_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_prolvd128((__v4si)__A, (__v4si)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_rolv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rolv_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_rolv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rolv_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_rolv_epi32 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_prolvd256((__v8si)__A, (__v8si)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_rolv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rolv_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_rolv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rolv_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_rolv_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_prolvq128((__v2di)__A, (__v2di)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_rolv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rolv_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_rolv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) -{ 
- return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rolv_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_rolv_epi64 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_prolvq256((__v4di)__A, (__v4di)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_rolv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rolv_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_rolv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rolv_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -#define _mm_ror_epi32(a, b) \ - ((__m128i)__builtin_ia32_prord128((__v4si)(__m128i)(a), (int)(b))) - -#define _mm_mask_ror_epi32(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)(__m128i)(w))) - -#define _mm_maskz_ror_epi32(u, a, b) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(u), \ - (__v4si)_mm_ror_epi32((a), (b)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_ror_epi32(a, b) \ - ((__m256i)__builtin_ia32_prord256((__v8si)(__m256i)(a), (int)(b))) - -#define _mm256_mask_ror_epi32(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)(__m256i)(w))) - -#define _mm256_maskz_ror_epi32(u, a, b) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(u), \ - (__v8si)_mm256_ror_epi32((a), (b)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_ror_epi64(a, b) \ - ((__m128i)__builtin_ia32_prorq128((__v2di)(__m128i)(a), (int)(b))) - -#define _mm_mask_ror_epi64(w, u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)(__m128i)(w))) - -#define _mm_maskz_ror_epi64(u, a, b) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(u), \ - (__v2di)_mm_ror_epi64((a), (b)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_ror_epi64(a, b) \ - ((__m256i)__builtin_ia32_prorq256((__v4di)(__m256i)(a), (int)(b))) - -#define _mm256_mask_ror_epi64(w, u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)(__m256i)(w))) - -#define _mm256_maskz_ror_epi64(u, a, b) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(u), \ - (__v4di)_mm256_ror_epi64((a), (b)), \ - (__v4di)_mm256_setzero_si256())) - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sll_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sll_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sll_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sll_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ 
__m128i __DEFAULT_FN_ATTRS128 -_mm_mask_slli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_slli_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_slli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_slli_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_slli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_slli_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_slli_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sll_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sll_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sll_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sll_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sll_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sll_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sll_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sll_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_slli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_slli_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_slli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_slli_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_slli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_slli_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_slli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_slli_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_rorv_epi32 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_prorvd128((__v4si)__A, (__v4si)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_rorv_epi32 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rorv_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_rorv_epi32 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return 
(__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_rorv_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_rorv_epi32 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_prorvd256((__v8si)__A, (__v8si)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_rorv_epi32 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rorv_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_rorv_epi32 (__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_rorv_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_rorv_epi64 (__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_prorvq128((__v2di)__A, (__v2di)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_rorv_epi64 (__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rorv_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_rorv_epi64 (__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_rorv_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_rorv_epi64 (__m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_prorvq256((__v4di)__A, (__v4di)__B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_rorv_epi64 (__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rorv_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_rorv_epi64 (__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_rorv_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sllv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sllv_epi64(__X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sllv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_sllv_epi64(__X, __Y), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sllv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sllv_epi64(__X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sllv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_sllv_epi64(__X, __Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sllv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sllv_epi32(__X, __Y), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sllv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sllv_epi32(__X, __Y), - 
(__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sllv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sllv_epi32(__X, __Y), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sllv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sllv_epi32(__X, __Y), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srlv_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srlv_epi64(__X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srlv_epi64(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srlv_epi64(__X, __Y), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srlv_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srlv_epi64(__X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srlv_epi64(__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srlv_epi64(__X, __Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srlv_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srlv_epi32(__X, __Y), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srlv_epi32(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srlv_epi32(__X, __Y), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srlv_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srlv_epi32(__X, __Y), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srlv_epi32(__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srlv_epi32(__X, __Y), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srl_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srl_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srl_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srl_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 -_mm_mask_srli_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srli_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srli_epi32(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srli_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srli_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srli_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srl_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srl_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srl_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srl_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srl_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srl_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srl_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srl_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srli_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srli_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srli_epi64(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srli_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srli_epi64(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srli_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srli_epi64(__mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srli_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srav_epi32(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srav_epi32(__X, __Y), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srav_epi32(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srav_epi32(__X, __Y), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_mask_srav_epi32(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srav_epi32(__X, __Y), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srav_epi32(__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srav_epi32(__X, __Y), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srav_epi64(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_psravq128((__v2di)__X, (__v2di)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srav_epi64(__m128i __W, __mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srav_epi64(__X, __Y), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srav_epi64(__mmask8 __U, __m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_srav_epi64(__X, __Y), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srav_epi64(__m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_psravq256((__v4di)__X, (__v4di) __Y); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srav_epi64(__m256i __W, __mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srav_epi64(__X, __Y), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srav_epi64 (__mmask8 __U, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_srav_epi64(__X, __Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi32 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, - (__v4si) __A, - (__v4si) __W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi32 (__mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectd_128 ((__mmask8) __U, - (__v4si) __A, - (__v4si) _mm_setzero_si128 ()); -} - - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi32 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, - (__v8si) __A, - (__v8si) __W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi32 (__mmask8 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectd_256 ((__mmask8) __U, - (__v8si) __A, - (__v8si) _mm256_setzero_si256 ()); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_load_epi32 (void const *__P) -{ - return *(const __m128i *) __P; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_load_epi32 (__m128i __W, __mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P, - (__v4si) __W, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_load_epi32 (__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_movdqa32load128_mask ((const __v4si *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) - __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_load_epi32 (void const *__P) -{ - return *(const __m256i *) __P; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_load_epi32 (__m256i __W, __mmask8 __U, void const *__P) -{ - return (__m256i) 
__builtin_ia32_movdqa32load256_mask ((const __v8si *) __P, - (__v8si) __W, - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_load_epi32 (__mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_movdqa32load256_mask ((const __v8si *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) - __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_store_epi32 (void *__P, __m128i __A) -{ - *(__m128i *) __P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_epi32 (void *__P, __mmask8 __U, __m128i __A) -{ - __builtin_ia32_movdqa32store128_mask ((__v4si *) __P, - (__v4si) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_store_epi32 (void *__P, __m256i __A) -{ - *(__m256i *) __P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_store_epi32 (void *__P, __mmask8 __U, __m256i __A) -{ - __builtin_ia32_movdqa32store256_mask ((__v8si *) __P, - (__v8si) __A, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_mov_epi64 (__m128i __W, __mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, - (__v2di) __A, - (__v2di) __W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_epi64 (__mmask8 __U, __m128i __A) -{ - return (__m128i) __builtin_ia32_selectq_128 ((__mmask8) __U, - (__v2di) __A, - (__v2di) _mm_setzero_si128 ()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_epi64 (__m256i __W, __mmask8 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, - (__v4di) __A, - (__v4di) __W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_epi64 (__mmask8 __U, __m256i __A) -{ - return (__m256i) __builtin_ia32_selectq_256 ((__mmask8) __U, - (__v4di) __A, - (__v4di) _mm256_setzero_si256 ()); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_load_epi64 (void const *__P) -{ - return *(const __m128i *) __P; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_load_epi64 (__m128i __W, __mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P, - (__v2di) __W, - (__mmask8) - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_load_epi64 (__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_movdqa64load128_mask ((const __v2di *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) - __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_load_epi64 (void const *__P) -{ - return *(const __m256i *) __P; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_load_epi64 (__m256i __W, __mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P, - (__v4di) __W, - (__mmask8) - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_load_epi64 (__mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_movdqa64load256_mask ((const __v4di *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) - __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_store_epi64 (void *__P, __m128i __A) -{ - *(__m128i *) __P = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_epi64 (void *__P, __mmask8 __U, __m128i __A) -{ - __builtin_ia32_movdqa64store128_mask ((__v2di *) __P, - (__v2di) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_store_epi64 (void *__P, __m256i __A) -{ - *(__m256i *) __P = __A; -} 
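
The masked load/store entry points above differ from the plain SSE/AVX loads in that unselected lanes either keep the pass-through value (mask_ forms) or are zeroed (maskz_ forms), and masked stores leave unselected memory untouched. A minimal usage sketch, again assuming a clang toolchain with -mavx512f -mavx512vl on AVX-512VL hardware; the arrays and masks below are illustrative only.

/* Usage sketch (hypothetical, not part of the patch): merge-masked and
 * zero-masked 32-byte aligned loads, and a masked store.
 * Build assumption: clang -O2 -mavx512f -mavx512vl example.c */
#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __attribute__((aligned(32))) int src[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    __attribute__((aligned(32))) int dst[8] = {0};
    int tmp[8];

    __m256i fill = _mm256_set1_epi32(-1);

    /* Merge-masked load: lanes whose mask bit is 0 keep the value from 'fill'. */
    __m256i merged = _mm256_mask_load_epi32(fill, 0x0F, src);
    /* merged == {1, 2, 3, 4, -1, -1, -1, -1} */

    /* Zero-masked load: unselected lanes become 0. */
    __m256i zeroed = _mm256_maskz_load_epi32(0xF0, src);
    _mm256_storeu_si256((__m256i *)tmp, zeroed);
    /* tmp == {0, 0, 0, 0, 5, 6, 7, 8} */

    /* Masked store: only lanes 0-3 of 'merged' are written; dst[4..7] is untouched. */
    _mm256_mask_store_epi32(dst, 0x0F, merged);

    for (int i = 0; i < 8; i++)
        printf("%d %d\n", dst[i], tmp[i]);
    return 0;
}

Note that _mm256_mask_load_epi32 and _mm256_mask_store_epi32 require 32-byte aligned pointers; the loadu_/storeu_ variants defined further down in this header lift that requirement.
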
- -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_store_epi64 (void *__P, __mmask8 __U, __m256i __A) -{ - __builtin_ia32_movdqa64store256_mask ((__v4di *) __P, - (__v4di) __A, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_movedup_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_movedup_pd(__A), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_movedup_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_movedup_pd(__A), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_movedup_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_movedup_pd(__A), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_movedup_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_movedup_pd(__A), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi32(__m128i __O, __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)__O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_set1_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi32(__m256i __O, __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)__O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi32( __mmask8 __M, int __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_set1_epi32(__A), - (__v8si)_mm256_setzero_si256()); -} - - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_set1_epi64 (__m128i __O, __mmask8 __M, long long __A) -{ - return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi64x(__A), - (__v2di) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ - return (__m128i) __builtin_ia32_selectq_128(__M, - (__v2di) _mm_set1_epi64x(__A), - (__v2di) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_set1_epi64 (__m256i __O, __mmask8 __M, long long __A) -{ - return (__m256i) __builtin_ia32_selectq_256(__M, - (__v4di) _mm256_set1_epi64x(__A), - (__v4di) __O) ; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_set1_epi64 (__mmask8 __M, long long __A) -{ - return (__m256i) __builtin_ia32_selectq_256(__M, - (__v4di) _mm256_set1_epi64x(__A), - (__v4di) _mm256_setzero_si256()); -} - -#define _mm_fixupimm_pd(A, B, C, imm) \ - ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fixupimm_pd(A, U, B, C, imm) \ - ((__m128d)__builtin_ia32_fixupimmpd128_mask((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_maskz_fixupimm_pd(U, A, B, C, imm) \ - ((__m128d)__builtin_ia32_fixupimmpd128_maskz((__v2df)(__m128d)(A), \ - (__v2df)(__m128d)(B), \ - (__v2di)(__m128i)(C), \ - (int)(imm), (__mmask8)(U))) - -#define 
_mm256_fixupimm_pd(A, B, C, imm) \ - ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_fixupimm_pd(A, U, B, C, imm) \ - ((__m256d)__builtin_ia32_fixupimmpd256_mask((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_maskz_fixupimm_pd(U, A, B, C, imm) \ - ((__m256d)__builtin_ia32_fixupimmpd256_maskz((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), \ - (__v4di)(__m256i)(C), \ - (int)(imm), (__mmask8)(U))) - -#define _mm_fixupimm_ps(A, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_fixupimm_ps(A, U, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmps128_mask((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_maskz_fixupimm_ps(U, A, B, C, imm) \ - ((__m128)__builtin_ia32_fixupimmps128_maskz((__v4sf)(__m128)(A), \ - (__v4sf)(__m128)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_fixupimm_ps(A, B, C, imm) \ - ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_fixupimm_ps(A, U, B, C, imm) \ - ((__m256)__builtin_ia32_fixupimmps256_mask((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_maskz_fixupimm_ps(U, A, B, C, imm) \ - ((__m256)__builtin_ia32_fixupimmps256_maskz((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_load_pd (__m128d __W, __mmask8 __U, void const *__P) -{ - return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_load_pd (__mmask8 __U, void const *__P) -{ - return (__m128d) __builtin_ia32_loadapd128_mask ((const __v2df *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_load_pd (__m256d __W, __mmask8 __U, void const *__P) -{ - return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_load_pd (__mmask8 __U, void const *__P) -{ - return (__m256d) __builtin_ia32_loadapd256_mask ((const __v4df *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_load_ps (__m128 __W, __mmask8 __U, void const *__P) -{ - return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_load_ps (__mmask8 __U, void const *__P) -{ - return (__m128) __builtin_ia32_loadaps128_mask ((const __v4sf *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_load_ps (__m256 __W, __mmask8 __U, void const *__P) -{ - return (__m256) __builtin_ia32_loadaps256_mask ((const __v8sf *) __P, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_load_ps (__mmask8 __U, void const *__P) -{ - return (__m256) 
__builtin_ia32_loadaps256_mask ((const __v8sf *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_loadu_epi64 (void const *__P) -{ - struct __loadu_epi64 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi64*)__P)->__v; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_epi64 (__m128i __W, __mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P, - (__v2di) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_epi64 (__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddqudi128_mask ((const __v2di *) __P, - (__v2di) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_loadu_epi64 (void const *__P) -{ - struct __loadu_epi64 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi64*)__P)->__v; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_epi64 (__m256i __W, __mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P, - (__v4di) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_epi64 (__mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddqudi256_mask ((const __v4di *) __P, - (__v4di) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline __m128i __DEFAULT_FN_ATTRS128 -_mm_loadu_epi32 (void const *__P) -{ - struct __loadu_epi32 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi32*)__P)->__v; -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_epi32 (__m128i __W, __mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P, - (__v4si) __W, - (__mmask8) __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_epi32 (__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_loaddqusi128_mask ((const __v4si *) __P, - (__v4si) - _mm_setzero_si128 (), - (__mmask8) __U); -} - -static __inline __m256i __DEFAULT_FN_ATTRS256 -_mm256_loadu_epi32 (void const *__P) -{ - struct __loadu_epi32 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_epi32*)__P)->__v; -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_epi32 (__m256i __W, __mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P, - (__v8si) __W, - (__mmask8) __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_epi32 (__mmask8 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_loaddqusi256_mask ((const __v8si *) __P, - (__v8si) - _mm256_setzero_si256 (), - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_pd (__m128d __W, __mmask8 __U, void const *__P) -{ - return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_pd (__mmask8 __U, void const *__P) -{ - return (__m128d) __builtin_ia32_loadupd128_mask ((const __v2df *) __P, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_pd (__m256d __W, __mmask8 __U, void const *__P) -{ - return 
(__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_pd (__mmask8 __U, void const *__P) -{ - return (__m256d) __builtin_ia32_loadupd256_mask ((const __v4df *) __P, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_loadu_ps (__m128 __W, __mmask8 __U, void const *__P) -{ - return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_loadu_ps (__mmask8 __U, void const *__P) -{ - return (__m128) __builtin_ia32_loadups128_mask ((const __v4sf *) __P, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_loadu_ps (__m256 __W, __mmask8 __U, void const *__P) -{ - return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_loadu_ps (__mmask8 __U, void const *__P) -{ - return (__m256) __builtin_ia32_loadups256_mask ((const __v8sf *) __P, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_pd (void *__P, __mmask8 __U, __m128d __A) -{ - __builtin_ia32_storeapd128_mask ((__v2df *) __P, - (__v2df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_store_pd (void *__P, __mmask8 __U, __m256d __A) -{ - __builtin_ia32_storeapd256_mask ((__v4df *) __P, - (__v4df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_store_ps (void *__P, __mmask8 __U, __m128 __A) -{ - __builtin_ia32_storeaps128_mask ((__v4sf *) __P, - (__v4sf) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_store_ps (void *__P, __mmask8 __U, __m256 __A) -{ - __builtin_ia32_storeaps256_mask ((__v8sf *) __P, - (__v8sf) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_storeu_epi64 (void *__P, __m128i __A) -{ - struct __storeu_epi64 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi64*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_epi64 (void *__P, __mmask8 __U, __m128i __A) -{ - __builtin_ia32_storedqudi128_mask ((__v2di *) __P, - (__v2di) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_storeu_epi64 (void *__P, __m256i __A) -{ - struct __storeu_epi64 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi64*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_epi64 (void *__P, __mmask8 __U, __m256i __A) -{ - __builtin_ia32_storedqudi256_mask ((__v4di *) __P, - (__v4di) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS128 -_mm_storeu_epi32 (void *__P, __m128i __A) -{ - struct __storeu_epi32 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_epi32*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_epi32 (void *__P, __mmask8 __U, __m128i __A) -{ - __builtin_ia32_storedqusi128_mask ((__v4si *) __P, - (__v4si) __A, - (__mmask8) __U); -} - -static __inline void __DEFAULT_FN_ATTRS256 -_mm256_storeu_epi32 (void *__P, __m256i __A) -{ - struct __storeu_epi32 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); 
- ((struct __storeu_epi32*)__P)->__v = __A; -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_epi32 (void *__P, __mmask8 __U, __m256i __A) -{ - __builtin_ia32_storedqusi256_mask ((__v8si *) __P, - (__v8si) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_pd (void *__P, __mmask8 __U, __m128d __A) -{ - __builtin_ia32_storeupd128_mask ((__v2df *) __P, - (__v2df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_pd (void *__P, __mmask8 __U, __m256d __A) -{ - __builtin_ia32_storeupd256_mask ((__v4df *) __P, - (__v4df) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_storeu_ps (void *__P, __mmask8 __U, __m128 __A) -{ - __builtin_ia32_storeups128_mask ((__v4sf *) __P, - (__v4sf) __A, - (__mmask8) __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_storeu_ps (void *__P, __mmask8 __U, __m256 __A) -{ - __builtin_ia32_storeups256_mask ((__v8sf *) __P, - (__v8sf) __A, - (__mmask8) __U); -} - - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_unpackhi_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_unpackhi_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_unpackhi_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_unpackhi_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_unpackhi_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_unpackhi_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_unpackhi_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_unpackhi_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_unpacklo_pd(__A, __B), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_pd(__mmask8 __U, __m128d __A, __m128d __B) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_unpacklo_pd(__A, __B), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ 
__m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256d __B) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_unpacklo_pd(__A, __B), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_pd(__mmask8 __U, __m256d __A, __m256d __B) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_unpacklo_pd(__A, __B), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_unpacklo_ps(__A, __B), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_ps(__mmask8 __U, __m128 __A, __m128 __B) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_unpacklo_ps(__A, __B), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256 __B) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_unpacklo_ps(__A, __B), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_ps(__mmask8 __U, __m256 __A, __m256 __B) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_unpacklo_ps(__A, __B), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_rcp14_pd (__m128d __A) -{ - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_rcp14_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_rcp14_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_rcp14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_rcp14_pd (__m256d __A) -{ - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_rcp14_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_rcp14_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_rcp14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_rcp14_ps (__m128 __A) -{ - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_rcp14_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_rcp14_ps (__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_rcp14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_rcp14_ps (__m256 __A) -{ - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) - 
_mm256_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_rcp14_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_rcp14_ps (__mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_rcp14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -#define _mm_mask_permute_pd(W, U, X, C) \ - ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)(__m128d)(W))) - -#define _mm_maskz_permute_pd(U, X, C) \ - ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_permute_pd((X), (C)), \ - (__v2df)_mm_setzero_pd())) - -#define _mm256_mask_permute_pd(W, U, X, C) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)(__m256d)(W))) - -#define _mm256_maskz_permute_pd(U, X, C) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permute_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd())) - -#define _mm_mask_permute_ps(W, U, X, C) \ - ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)(__m128)(W))) - -#define _mm_maskz_permute_ps(U, X, C) \ - ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_permute_ps((X), (C)), \ - (__v4sf)_mm_setzero_ps())) - -#define _mm256_mask_permute_ps(W, U, X, C) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)(__m256)(W))) - -#define _mm256_maskz_permute_ps(U, X, C) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_permute_ps((X), (C)), \ - (__v8sf)_mm256_setzero_ps())) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_permutevar_pd(__m128d __W, __mmask8 __U, __m128d __A, __m128i __C) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_permutevar_pd(__A, __C), - (__v2df)__W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_permutevar_pd(__mmask8 __U, __m128d __A, __m128i __C) -{ - return (__m128d)__builtin_ia32_selectpd_128((__mmask8)__U, - (__v2df)_mm_permutevar_pd(__A, __C), - (__v2df)_mm_setzero_pd()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_permutevar_pd(__m256d __W, __mmask8 __U, __m256d __A, __m256i __C) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_permutevar_pd(__A, __C), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutevar_pd(__mmask8 __U, __m256d __A, __m256i __C) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_permutevar_pd(__A, __C), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_permutevar_ps(__m128 __W, __mmask8 __U, __m128 __A, __m128i __C) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_permutevar_ps(__A, __C), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_permutevar_ps(__mmask8 __U, __m128 __A, __m128i __C) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_permutevar_ps(__A, __C), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_permutevar_ps(__m256 __W, __mmask8 __U, __m256 __A, __m256i __C) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - 
(__v8sf)_mm256_permutevar_ps(__A, __C), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutevar_ps(__mmask8 __U, __m256 __A, __m256i __C) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_permutevar_ps(__A, __C), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_test_epi32_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpneq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpneq_epi32_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_test_epi32_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpneq_epi32_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_mask_test_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpneq_epi32_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_test_epi64_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpneq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_test_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpneq_epi64_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_test_epi64_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpneq_epi64_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_mask_test_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpneq_epi64_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_testn_epi32_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpeq_epi32_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_testn_epi32_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpeq_epi32_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_testn_epi32_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpeq_epi32_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_mask_testn_epi32_mask (__mmask8 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_cmpeq_epi32_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_testn_epi64_mask (__m128i __A, __m128i __B) -{ - return _mm_cmpeq_epi64_mask (_mm_and_si128 (__A, __B), _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS128 -_mm_mask_testn_epi64_mask (__mmask8 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_cmpeq_epi64_mask (__U, _mm_and_si128 (__A, __B), - _mm_setzero_si128()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_testn_epi64_mask (__m256i __A, __m256i __B) -{ - return _mm256_cmpeq_epi64_mask (_mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __mmask8 __DEFAULT_FN_ATTRS256 -_mm256_mask_testn_epi64_mask (__mmask8 __U, __m256i __A, __m256i __B) -{ - return 
_mm256_mask_cmpeq_epi64_mask (__U, _mm256_and_si256 (__A, __B), - _mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_unpackhi_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_unpackhi_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_unpackhi_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_unpackhi_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpackhi_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_unpackhi_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpackhi_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_unpackhi_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpackhi_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_unpackhi_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpackhi_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_unpackhi_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_unpacklo_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_unpacklo_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_unpacklo_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi32(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_unpacklo_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_unpacklo_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_unpacklo_epi64(__A, __B), - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_unpacklo_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return 
(__m128i)__builtin_ia32_selectq_128((__mmask8)__U, - (__v2di)_mm_unpacklo_epi64(__A, __B), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_unpacklo_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_unpacklo_epi64(__A, __B), - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_unpacklo_epi64(__mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, - (__v4di)_mm256_unpacklo_epi64(__A, __B), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi32(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sra_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi32(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_sra_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_sra_epi32(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sra_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi32(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_sra_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi32(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srai_epi32(__A, __B), - (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi32(__mmask8 __U, __m128i __A, unsigned int __B) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__U, - (__v4si)_mm_srai_epi32(__A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srai_epi32(__m256i __W, __mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srai_epi32(__A, __B), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi32(__mmask8 __U, __m256i __A, unsigned int __B) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__U, - (__v8si)_mm256_srai_epi32(__A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_sra_epi64(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_psraq128((__v2di)__A, (__v2di)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_sra_epi64(__m128i __W, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ - (__v2di)_mm_sra_epi64(__A, __B), \ - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_sra_epi64(__mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ - (__v2di)_mm_sra_epi64(__A, __B), \ - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sra_epi64(__m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_psraq256((__v4di) __A, (__v2di) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_mask_sra_epi64(__m256i __W, __mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ - (__v4di)_mm256_sra_epi64(__A, __B), \ - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_sra_epi64(__mmask8 __U, __m256i __A, __m128i __B) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ - (__v4di)_mm256_sra_epi64(__A, __B), \ - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_srai_epi64(__m128i __A, unsigned int __imm) -{ - return (__m128i)__builtin_ia32_psraqi128((__v2di)__A, __imm); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_srai_epi64(__m128i __W, __mmask8 __U, __m128i __A, unsigned int __imm) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ - (__v2di)_mm_srai_epi64(__A, __imm), \ - (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_srai_epi64(__mmask8 __U, __m128i __A, unsigned int __imm) -{ - return (__m128i)__builtin_ia32_selectq_128((__mmask8)__U, \ - (__v2di)_mm_srai_epi64(__A, __imm), \ - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_srai_epi64(__m256i __A, unsigned int __imm) -{ - return (__m256i)__builtin_ia32_psraqi256((__v4di)__A, __imm); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_srai_epi64(__m256i __W, __mmask8 __U, __m256i __A, - unsigned int __imm) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ - (__v4di)_mm256_srai_epi64(__A, __imm), \ - (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_srai_epi64(__mmask8 __U, __m256i __A, unsigned int __imm) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__U, \ - (__v4di)_mm256_srai_epi64(__A, __imm), \ - (__v4di)_mm256_setzero_si256()); -} - -#define _mm_ternarylogic_epi32(A, B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi32(A, U, B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogd128_mask((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogd128_maskz((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), \ - (__v4si)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_ternarylogic_epi32(A, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi32(A, U, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogd256_mask((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi32(U, A, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogd256_maskz((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), \ - (__v8si)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_ternarylogic_epi64(A, B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm_mask_ternarylogic_epi64(A, U, B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogq128_mask((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm_maskz_ternarylogic_epi64(U, A, 
B, C, imm) \ - ((__m128i)__builtin_ia32_pternlogq128_maskz((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), \ - (__v2di)(__m128i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_ternarylogic_epi64(A, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)-1)) - -#define _mm256_mask_ternarylogic_epi64(A, U, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogq256_mask((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - -#define _mm256_maskz_ternarylogic_epi64(U, A, B, C, imm) \ - ((__m256i)__builtin_ia32_pternlogq256_maskz((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (__v4di)(__m256i)(C), (int)(imm), \ - (__mmask8)(U))) - - - -#define _mm256_shuffle_f32x4(A, B, imm) \ - ((__m256)__builtin_ia32_shuf_f32x4_256((__v8sf)(__m256)(A), \ - (__v8sf)(__m256)(B), (int)(imm))) - -#define _mm256_mask_shuffle_f32x4(W, U, A, B, imm) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W))) - -#define _mm256_maskz_shuffle_f32x4(U, A, B, imm) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_f32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps())) - -#define _mm256_shuffle_f64x2(A, B, imm) \ - ((__m256d)__builtin_ia32_shuf_f64x2_256((__v4df)(__m256d)(A), \ - (__v4df)(__m256d)(B), (int)(imm))) - -#define _mm256_mask_shuffle_f64x2(W, U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)(__m256d)(W))) - -#define _mm256_maskz_shuffle_f64x2(U, A, B, imm) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_f64x2((A), (B), (imm)), \ - (__v4df)_mm256_setzero_pd())) - -#define _mm256_shuffle_i32x4(A, B, imm) \ - ((__m256i)__builtin_ia32_shuf_i32x4_256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm))) - -#define _mm256_mask_shuffle_i32x4(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)(__m256i)(W))) - -#define _mm256_maskz_shuffle_i32x4(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_i32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm256_shuffle_i64x2(A, B, imm) \ - ((__m256i)__builtin_ia32_shuf_i64x2_256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm))) - -#define _mm256_mask_shuffle_i64x2(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)(__m256i)(W))) - - -#define _mm256_maskz_shuffle_i64x2(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shuffle_i64x2((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256())) - -#define _mm_mask_shuffle_pd(W, U, A, B, M) \ - ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)(__m128d)(W))) - -#define _mm_maskz_shuffle_pd(U, A, B, M) \ - ((__m128d)__builtin_ia32_selectpd_128((__mmask8)(U), \ - (__v2df)_mm_shuffle_pd((A), (B), (M)), \ - (__v2df)_mm_setzero_pd())) - -#define _mm256_mask_shuffle_pd(W, U, A, B, M) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)(__m256d)(W))) - -#define _mm256_maskz_shuffle_pd(U, A, B, M) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - 
(__v4df)_mm256_shuffle_pd((A), (B), (M)), \ - (__v4df)_mm256_setzero_pd())) - -#define _mm_mask_shuffle_ps(W, U, A, B, M) \ - ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)(__m128)(W))) - -#define _mm_maskz_shuffle_ps(U, A, B, M) \ - ((__m128)__builtin_ia32_selectps_128((__mmask8)(U), \ - (__v4sf)_mm_shuffle_ps((A), (B), (M)), \ - (__v4sf)_mm_setzero_ps())) - -#define _mm256_mask_shuffle_ps(W, U, A, B, M) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)(__m256)(W))) - -#define _mm256_maskz_shuffle_ps(U, A, B, M) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_shuffle_ps((A), (B), (M)), \ - (__v8sf)_mm256_setzero_ps())) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_rsqrt14_pd (__m128d __A) -{ - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_rsqrt14_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) __W, - (__mmask8) __U); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt14_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_rsqrt14pd128_mask ((__v2df) __A, - (__v2df) - _mm_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_rsqrt14_pd (__m256d __A) -{ - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) -1); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_rsqrt14_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) __W, - (__mmask8) __U); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_rsqrt14_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_rsqrt14pd256_mask ((__v4df) __A, - (__v4df) - _mm256_setzero_pd (), - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_rsqrt14_ps (__m128 __A) -{ - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_rsqrt14_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_rsqrt14_ps (__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_rsqrt14ps128_mask ((__v4sf) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_rsqrt14_ps (__m256 __A) -{ - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) -1); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_rsqrt14_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_rsqrt14_ps (__mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_rsqrt14ps256_mask ((__v8sf) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_broadcast_f32x4(__m128 __A) -{ - return (__m256)__builtin_shufflevector((__v4sf)__A, (__v4sf)__A, - 0, 1, 
2, 3, 0, 1, 2, 3); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_f32x4(__m256 __O, __mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x4(__A), - (__v8sf)__O); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_f32x4 (__mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__M, - (__v8sf)_mm256_broadcast_f32x4(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_broadcast_i32x4(__m128i __A) -{ - return (__m256i)__builtin_shufflevector((__v4si)__A, (__v4si)__A, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcast_i32x4(__m256i __O, __mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x4(__A), - (__v8si)__O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcast_i32x4(__mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_broadcast_i32x4(__A), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastsd_pd (__m256d __O, __mmask8 __M, __m128d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256(__M, - (__v4df) _mm256_broadcastsd_pd(__A), - (__v4df) __O); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastsd_pd (__mmask8 __M, __m128d __A) -{ - return (__m256d)__builtin_ia32_selectpd_256(__M, - (__v4df) _mm256_broadcastsd_pd(__A), - (__v4df) _mm256_setzero_pd()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastss_ps (__m128 __O, __mmask8 __M, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128(__M, - (__v4sf) _mm_broadcastss_ps(__A), - (__v4sf) __O); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128(__M, - (__v4sf) _mm_broadcastss_ps(__A), - (__v4sf) _mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastss_ps (__m256 __O, __mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256(__M, - (__v8sf) _mm256_broadcastss_ps(__A), - (__v8sf) __O); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastss_ps (__mmask8 __M, __m128 __A) -{ - return (__m256)__builtin_ia32_selectps_256(__M, - (__v8sf) _mm256_broadcastss_ps(__A), - (__v8sf) _mm256_setzero_ps()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_broadcastd_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_broadcastd_epi32(__A), - (__v4si) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectd_128(__M, - (__v4si) _mm_broadcastd_epi32(__A), - (__v4si) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastd_epi32 (__m256i __O, __mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_broadcastd_epi32(__A), - (__v8si) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastd_epi32 (__mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectd_256(__M, - (__v8si) _mm256_broadcastd_epi32(__A), - (__v8si) _mm256_setzero_si256()); -} - -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 -_mm_mask_broadcastq_epi64 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di) _mm_broadcastq_epi64(__A), - (__v2di) __O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ - return (__m128i)__builtin_ia32_selectq_128(__M, - (__v2di) _mm_broadcastq_epi64(__A), - (__v2di) _mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_broadcastq_epi64 (__m256i __O, __mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di) _mm256_broadcastq_epi64(__A), - (__v4di) __O); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_broadcastq_epi64 (__mmask8 __M, __m128i __A) -{ - return (__m256i)__builtin_ia32_selectq_256(__M, - (__v4di) _mm256_broadcastq_epi64(__A), - (__v4di) _mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi32_epi8 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi32_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb128_mask ((__v4si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovsdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi32_epi8 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi32_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsdb256_mask ((__v8si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovsdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi32_epi16 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi)_mm_setzero_si128 (), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi)__O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi32_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsdw128_mask ((__v4si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovsdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi32_epi16 (__m256i __A) -{ - return (__m128i) 
__builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi32_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsdw256_mask ((__v8si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovsdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi64_epi8 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi64_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb128_mask ((__v2di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovsqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi64_epi8 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi64_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqb256_mask ((__v4di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovsqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi64_epi32 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi64_epi32 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd128_mask ((__v2di) __A, - (__v4si) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovsqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi64_epi32 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_epi32 
(__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si)__O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi64_epi32 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqd256_mask ((__v4di) __A, - (__v4si) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovsqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtsepi64_epi16 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtsepi64_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw128_mask ((__v2di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovsqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtsepi64_epi16 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtsepi64_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovsqw256_mask ((__v4di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtsepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovsqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi32_epi8 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi32_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb128_mask ((__v4si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovusdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi32_epi8 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i 
__DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi32_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdb256_mask ((__v8si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovusdb256mem_mask ((__v16qi*) __P, (__v8si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi32_epi16 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi32_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw128_mask ((__v4si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovusdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi32_epi16 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) _mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi32_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusdw256_mask ((__v8si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovusdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi64_epi8 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi64_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb128_mask ((__v2di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovusqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi64_epi8 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, - (__v16qi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) __A, - (__v16qi) __O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi64_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqb256_mask ((__v4di) 
__A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovusqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi64_epi32 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi64_epi32 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd128_mask ((__v2di) __A, - (__v4si) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovusqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi64_epi32 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi64_epi32 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqd256_mask ((__v4di) __A, - (__v4si) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovusqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtusepi64_epi16 (__m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtusepi64_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw128_mask ((__v2di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtusepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovusqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtusepi64_epi16 (__m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi)_mm_undefined_si128(), - (__mmask8) -1); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtusepi64_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovusqw256_mask ((__v4di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtusepi64_storeu_epi16 (void * __P, 
__mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovusqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi32_epi8 (__m128i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v4si)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, - 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi32_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovdb128_mask ((__v4si) __A, - (__v16qi) - _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovdb128mem_mask ((__v16qi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi32_epi8 (__m256i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v8si)__A, __v8qi), - (__v8qi){0, 0, 0, 0, 0, 0, 0, 0}, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, - 12, 13, 14, 15); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi32_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovdb256_mask ((__v8si) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovdb256mem_mask ((__v16qi *) __P, (__v8si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi32_epi16 (__m128i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v4si)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, - 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi32_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovdw128_mask ((__v4si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovdw128mem_mask ((__v8hi *) __P, (__v4si) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi32_epi16 (__m256i __A) -{ - return (__m128i)__builtin_convertvector((__v8si)__A, __v8hi); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi32_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovdw256_mask ((__v8si) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi32_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovdw256mem_mask ((__v8hi *) __P, (__v8si) __A, __M); -} - -static 
__inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi64_epi8 (__m128i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v2di)__A, __v2qi), (__v2qi){0, 0}, 0, 1, 2, 3, - 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_epi8 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqb128_mask ((__v2di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovqb128mem_mask ((__v16qi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_epi8 (__m256i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v4di)__A, __v4qi), (__v4qi){0, 0, 0, 0}, 0, 1, - 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_epi8 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_epi8 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovqb256_mask ((__v4di) __A, - (__v16qi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_storeu_epi8 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovqb256mem_mask ((__v16qi *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi64_epi32 (__m128i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v2di)__A, __v2si), (__v2si){0, 0}, 0, 1, 2, 3); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_epi32 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqd128_mask ((__v2di) __A, - (__v4si) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovqd128mem_mask ((__v4si *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_epi32 (__m256i __A) -{ - return (__m128i)__builtin_convertvector((__v4di)__A, __v4si); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_epi32 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm256_cvtepi64_epi32(__A), - (__v4si)__O); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_epi32 (__mmask8 __M, __m256i __A) -{ - return (__m128i)__builtin_ia32_selectd_128((__mmask8)__M, - (__v4si)_mm256_cvtepi64_epi32(__A), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_storeu_epi32 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovqd256mem_mask ((__v4si *) __P, (__v4di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_cvtepi64_epi16 (__m128i __A) -{ - return 
(__m128i)__builtin_shufflevector( - __builtin_convertvector((__v2di)__A, __v2hi), (__v2hi){0, 0}, 0, 1, 2, 3, - 3, 3, 3, 3); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi)__O, - __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtepi64_epi16 (__mmask8 __M, __m128i __A) -{ - return (__m128i) __builtin_ia32_pmovqw128_mask ((__v2di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m128i __A) -{ - __builtin_ia32_pmovqw128mem_mask ((__v8hi *) __P, (__v2di) __A, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_cvtepi64_epi16 (__m256i __A) -{ - return (__m128i)__builtin_shufflevector( - __builtin_convertvector((__v4di)__A, __v4hi), (__v4hi){0, 0, 0, 0}, 0, 1, - 2, 3, 4, 5, 6, 7); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_epi16 (__m128i __O, __mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi) __O, __M); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtepi64_epi16 (__mmask8 __M, __m256i __A) -{ - return (__m128i) __builtin_ia32_pmovqw256_mask ((__v4di) __A, - (__v8hi) _mm_setzero_si128 (), - __M); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtepi64_storeu_epi16 (void * __P, __mmask8 __M, __m256i __A) -{ - __builtin_ia32_pmovqw256mem_mask ((__v8hi *) __P, (__v4di) __A, __M); -} - -#define _mm256_extractf32x4_ps(A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_undefined_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_extractf32x4_ps(W, U, A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_extractf32x4_ps(U, A, imm) \ - ((__m128)__builtin_ia32_extractf32x4_256_mask((__v8sf)(__m256)(A), \ - (int)(imm), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_extracti32x4_epi32(A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_undefined_si128(), \ - (__mmask8)-1)) - -#define _mm256_mask_extracti32x4_epi32(W, U, A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_extracti32x4_epi32(U, A, imm) \ - ((__m128i)__builtin_ia32_extracti32x4_256_mask((__v8si)(__m256i)(A), \ - (int)(imm), \ - (__v4si)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm256_insertf32x4(A, B, imm) \ - ((__m256)__builtin_ia32_insertf32x4_256((__v8sf)(__m256)(A), \ - (__v4sf)(__m128)(B), (int)(imm))) - -#define _mm256_mask_insertf32x4(W, U, A, B, imm) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)(__m256)(W))) - -#define _mm256_maskz_insertf32x4(U, A, B, imm) \ - ((__m256)__builtin_ia32_selectps_256((__mmask8)(U), \ - (__v8sf)_mm256_insertf32x4((A), (B), (imm)), \ - (__v8sf)_mm256_setzero_ps())) - -#define _mm256_inserti32x4(A, B, imm) \ - ((__m256i)__builtin_ia32_inserti32x4_256((__v8si)(__m256i)(A), \ - (__v4si)(__m128i)(B), (int)(imm))) - -#define _mm256_mask_inserti32x4(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_inserti32x4((A), 
(B), (imm)), \ - (__v8si)(__m256i)(W))) - -#define _mm256_maskz_inserti32x4(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_inserti32x4((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_getmant_pd(A, B, C) \ - ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm_mask_getmant_pd(W, U, A, B, C) \ - ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)(__m128d)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_getmant_pd(U, A, B, C) \ - ((__m128d)__builtin_ia32_getmantpd128_mask((__v2df)(__m128d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v2df)_mm_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm256_getmant_pd(A, B, C) \ - ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)-1)) - -#define _mm256_mask_getmant_pd(W, U, A, B, C) \ - ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)(__m256d)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_getmant_pd(U, A, B, C) \ - ((__m256d)__builtin_ia32_getmantpd256_mask((__v4df)(__m256d)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4df)_mm256_setzero_pd(), \ - (__mmask8)(U))) - -#define _mm_getmant_ps(A, B, C) \ - ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm_mask_getmant_ps(W, U, A, B, C) \ - ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)(__m128)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_getmant_ps(U, A, B, C) \ - ((__m128)__builtin_ia32_getmantps128_mask((__v4sf)(__m128)(A), \ - (int)(((C)<<2) | (B)), \ - (__v4sf)_mm_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm256_getmant_ps(A, B, C) \ - ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)-1)) - -#define _mm256_mask_getmant_ps(W, U, A, B, C) \ - ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)(__m256)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_getmant_ps(U, A, B, C) \ - ((__m256)__builtin_ia32_getmantps256_mask((__v8sf)(__m256)(A), \ - (int)(((C)<<2) | (B)), \ - (__v8sf)_mm256_setzero_ps(), \ - (__mmask8)(U))) - -#define _mm_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - ((__m128d)__builtin_ia32_gather3div2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m128i)__builtin_ia32_gather3div2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i64gather_pd(v1_old, mask, index, addr, scale) \ - ((__m256d)__builtin_ia32_gather3div4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i64gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m256i)__builtin_ia32_gather3div4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - ((__m128)__builtin_ia32_gather3div4sf((__v4sf)(__m128)(v1_old), \ 
- (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m128i)__builtin_ia32_gather3div4si((__v4si)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v2di)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i64gather_ps(v1_old, mask, index, addr, scale) \ - ((__m128)__builtin_ia32_gather3div8sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i64gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m128i)__builtin_ia32_gather3div8si((__v4si)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v4di)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - ((__m128d)__builtin_ia32_gather3siv2df((__v2df)(__m128d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m128i)__builtin_ia32_gather3siv2di((__v2di)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i32gather_pd(v1_old, mask, index, addr, scale) \ - ((__m256d)__builtin_ia32_gather3siv4df((__v4df)(__m256d)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i32gather_epi64(v1_old, mask, index, addr, scale) \ - ((__m256i)__builtin_ia32_gather3siv4di((__v4di)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - ((__m128)__builtin_ia32_gather3siv4sf((__v4sf)(__m128)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m128i)__builtin_ia32_gather3siv4si((__v4si)(__m128i)(v1_old), \ - (void const *)(addr), \ - (__v4si)(__m128i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i32gather_ps(v1_old, mask, index, addr, scale) \ - ((__m256)__builtin_ia32_gather3siv8sf((__v8sf)(__m256)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_mmask_i32gather_epi32(v1_old, mask, index, addr, scale) \ - ((__m256i)__builtin_ia32_gather3siv8si((__v8si)(__m256i)(v1_old), \ - (void const *)(addr), \ - (__v8si)(__m256i)(index), \ - (__mmask8)(mask), (int)(scale))) - -#define _mm256_permutex_pd(X, C) \ - ((__m256d)__builtin_ia32_permdf256((__v4df)(__m256d)(X), (int)(C))) - -#define _mm256_mask_permutex_pd(W, U, X, C) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)(__m256d)(W))) - -#define _mm256_maskz_permutex_pd(U, X, C) \ - ((__m256d)__builtin_ia32_selectpd_256((__mmask8)(U), \ - (__v4df)_mm256_permutex_pd((X), (C)), \ - (__v4df)_mm256_setzero_pd())) - -#define _mm256_permutex_epi64(X, C) \ - ((__m256i)__builtin_ia32_permdi256((__v4di)(__m256i)(X), (int)(C))) - -#define _mm256_mask_permutex_epi64(W, U, X, C) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_permutex_epi64((X), (C)), \ - (__v4di)(__m256i)(W))) - -#define _mm256_maskz_permutex_epi64(U, X, C) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_permutex_epi64((X), 
(C)), \ - (__v4di)_mm256_setzero_si256())) - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_pd (__m256i __X, __m256d __Y) -{ - return (__m256d)__builtin_ia32_permvardf256((__v4df)__Y, (__v4di)__X); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_pd (__m256d __W, __mmask8 __U, __m256i __X, - __m256d __Y) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_permutexvar_pd(__X, __Y), - (__v4df)__W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_pd (__mmask8 __U, __m256i __X, __m256d __Y) -{ - return (__m256d)__builtin_ia32_selectpd_256((__mmask8)__U, - (__v4df)_mm256_permutexvar_pd(__X, __Y), - (__v4df)_mm256_setzero_pd()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_permutexvar_epi64 ( __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_permvardi256((__v4di) __Y, (__v4di) __X); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi64 (__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_permutexvar_epi64(__X, __Y), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi64 (__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectq_256((__mmask8)__M, - (__v4di)_mm256_permutexvar_epi64(__X, __Y), - (__v4di)__W); -} - -#define _mm256_permutexvar_ps(A, B) _mm256_permutevar8x32_ps((B), (A)) - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_ps(__m256 __W, __mmask8 __U, __m256i __X, __m256 __Y) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_permutexvar_ps(__X, __Y), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_ps(__mmask8 __U, __m256i __X, __m256 __Y) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_permutexvar_ps(__X, __Y), - (__v8sf)_mm256_setzero_ps()); -} - -#define _mm256_permutexvar_epi32(A, B) _mm256_permutevar8x32_epi32((B), (A)) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_permutexvar_epi32(__m256i __W, __mmask8 __M, __m256i __X, - __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_permutexvar_epi32(__X, __Y), - (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_permutexvar_epi32(__mmask8 __M, __m256i __X, __m256i __Y) -{ - return (__m256i)__builtin_ia32_selectd_256((__mmask8)__M, - (__v8si)_mm256_permutexvar_epi32(__X, __Y), - (__v8si)_mm256_setzero_si256()); -} - -#define _mm_alignr_epi32(A, B, imm) \ - ((__m128i)__builtin_ia32_alignd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(imm))) - -#define _mm_mask_alignr_epi32(W, U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)(__m128i)(W))) - -#define _mm_maskz_alignr_epi32(U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_alignr_epi32((A), (B), (imm)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_alignr_epi32(A, B, imm) \ - ((__m256i)__builtin_ia32_alignd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(imm))) - -#define _mm256_mask_alignr_epi32(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)(__m256i)(W))) - -#define _mm256_maskz_alignr_epi32(U, A, B, imm) 
\ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_alignr_epi32((A), (B), (imm)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_alignr_epi64(A, B, imm) \ - ((__m128i)__builtin_ia32_alignq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(imm))) - -#define _mm_mask_alignr_epi64(W, U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)(__m128i)(W))) - -#define _mm_maskz_alignr_epi64(U, A, B, imm) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_alignr_epi64((A), (B), (imm)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_alignr_epi64(A, B, imm) \ - ((__m256i)__builtin_ia32_alignq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(imm))) - -#define _mm256_mask_alignr_epi64(W, U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)(__m256i)(W))) - -#define _mm256_maskz_alignr_epi64(U, A, B, imm) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_alignr_epi64((A), (B), (imm)), \ - (__v4di)_mm256_setzero_si256())) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_movehdup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_movehdup_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_movehdup_ps (__mmask8 __U, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_movehdup_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_movehdup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_movehdup_ps(__A), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_movehdup_ps (__mmask8 __U, __m256 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_movehdup_ps(__A), - (__v8sf)_mm256_setzero_ps()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_moveldup_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_moveldup_ps(__A), - (__v4sf)__W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_moveldup_ps (__mmask8 __U, __m128 __A) -{ - return (__m128)__builtin_ia32_selectps_128((__mmask8)__U, - (__v4sf)_mm_moveldup_ps(__A), - (__v4sf)_mm_setzero_ps()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_moveldup_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_moveldup_ps(__A), - (__v8sf)__W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_moveldup_ps (__mmask8 __U, __m256 __A) -{ - return (__m256)__builtin_ia32_selectps_256((__mmask8)__U, - (__v8sf)_mm256_moveldup_ps(__A), - (__v8sf)_mm256_setzero_ps()); -} - -#define _mm256_mask_shuffle_epi32(W, U, A, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)(__m256i)(W))) - -#define _mm256_maskz_shuffle_epi32(U, A, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shuffle_epi32((A), (I)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_mask_shuffle_epi32(W, U, A, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)(__m128i)(W))) - -#define 
_mm_maskz_shuffle_epi32(U, A, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shuffle_epi32((A), (I)), \ - (__v4si)_mm_setzero_si128())) - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_mask_mov_pd (__m128d __W, __mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) __W); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_pd (__mmask8 __U, __m128d __A) -{ - return (__m128d) __builtin_ia32_selectpd_128 ((__mmask8) __U, - (__v2df) __A, - (__v2df) _mm_setzero_pd ()); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_pd (__m256d __W, __mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) __W); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_pd (__mmask8 __U, __m256d __A) -{ - return (__m256d) __builtin_ia32_selectpd_256 ((__mmask8) __U, - (__v4df) __A, - (__v4df) _mm256_setzero_pd ()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_mov_ps (__m128 __W, __mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) __W); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_mov_ps (__mmask8 __U, __m128 __A) -{ - return (__m128) __builtin_ia32_selectps_128 ((__mmask8) __U, - (__v4sf) __A, - (__v4sf) _mm_setzero_ps ()); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_mov_ps (__m256 __W, __mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) __W); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_mov_ps (__mmask8 __U, __m256 __A) -{ - return (__m256) __builtin_ia32_selectps_256 ((__mmask8) __U, - (__v8sf) __A, - (__v8sf) _mm256_setzero_ps ()); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_mask_cvtph_ps (__m128 __W, __mmask8 __U, __m128i __A) -{ - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, - (__v4sf) __W, - (__mmask8) __U); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maskz_cvtph_ps (__mmask8 __U, __m128i __A) -{ - return (__m128) __builtin_ia32_vcvtph2ps_mask ((__v8hi) __A, - (__v4sf) - _mm_setzero_ps (), - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_mask_cvtph_ps (__m256 __W, __mmask8 __U, __m128i __A) -{ - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, - (__v8sf) __W, - (__mmask8) __U); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maskz_cvtph_ps (__mmask8 __U, __m128i __A) -{ - return (__m256) __builtin_ia32_vcvtph2ps256_mask ((__v8hi) __A, - (__v8sf) - _mm256_setzero_ps (), - (__mmask8) __U); -} - -#define _mm_mask_cvt_roundps_ph(W, U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm_maskz_cvt_roundps_ph(U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph_mask((__v4sf)(__m128)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U))) - -#define _mm_mask_cvtps_ph _mm_mask_cvt_roundps_ph -#define _mm_maskz_cvtps_ph _mm_maskz_cvt_roundps_ph - -#define _mm256_mask_cvt_roundps_ph(W, U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)(__m128i)(W), \ - (__mmask8)(U))) - -#define _mm256_maskz_cvt_roundps_ph(U, A, I) \ - ((__m128i)__builtin_ia32_vcvtps2ph256_mask((__v8sf)(__m256)(A), (int)(I), \ - (__v8hi)_mm_setzero_si128(), \ - (__mmask8)(U))) - 
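
As an aside (not part of the patch itself), a minimal usage sketch of the masked half-/single-precision conversion intrinsics deleted above might look as follows. It assumes a clang build with AVX512F and AVX512VL enabled (e.g. -mavx512f -mavx512vl); the helper name and the scale factor are illustrative assumptions, only the intrinsic signatures come from the header text above.

#include <immintrin.h>

/* Widen 8 packed FP16 values to single precision under a lane mask, scale
 * them, then narrow back to FP16, merging untouched lanes from the input. */
static __m128i scale_selected_halves(__m128i halves, __mmask8 lanes)
{
    /* Zero-masking form: lanes cleared in 'lanes' become 0.0f. */
    __m256 singles = _mm256_maskz_cvtph_ps(lanes, halves);
    singles = _mm256_mul_ps(singles, _mm256_set1_ps(2.0f)); /* illustrative scale */
    /* Merge-masking form: lanes cleared in 'lanes' keep the bits of 'halves'. */
    return _mm256_mask_cvtps_ph(halves, lanes, singles, _MM_FROUND_CUR_DIRECTION);
}
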
-#define _mm256_mask_cvtps_ph _mm256_mask_cvt_roundps_ph -#define _mm256_maskz_cvtps_ph _mm256_maskz_cvt_roundps_ph - - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __AVX512VLINTRIN_H */ diff --git a/include/avx512vlvbmi2intrin.h b/include/avx512vlvbmi2intrin.h deleted file mode 100644 index fac1f23..0000000 --- a/include/avx512vlvbmi2intrin.h +++ /dev/null @@ -1,689 +0,0 @@ -/*===------------- avx512vlvbmi2intrin.h - VBMI2 intrinsics -----------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VLVBMI2INTRIN_H -#define __AVX512VLVBMI2INTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vbmi2"), __min_vector_width__(256))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_compress_epi16(__m128i __S, __mmask8 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, - (__v8hi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_epi16(__mmask8 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_compresshi128_mask ((__v8hi) __D, - (__v8hi) _mm_setzero_si128(), - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_compress_epi8(__m128i __S, __mmask16 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, - (__v16qi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_compress_epi8(__mmask16 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_compressqi128_mask ((__v16qi) __D, - (__v16qi) _mm_setzero_si128(), - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_epi16(void *__P, __mmask8 __U, __m128i __D) -{ - __builtin_ia32_compressstorehi128_mask ((__v8hi *) __P, (__v8hi) __D, - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_mask_compressstoreu_epi8(void *__P, __mmask16 __U, __m128i __D) -{ - __builtin_ia32_compressstoreqi128_mask ((__v16qi *) __P, (__v16qi) __D, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expand_epi16(__m128i __S, __mmask8 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, - (__v8hi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_epi16(__mmask8 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_expandhi128_mask ((__v8hi) __D, - (__v8hi) _mm_setzero_si128(), - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expand_epi8(__m128i __S, __mmask16 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, - (__v16qi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expand_epi8(__mmask16 __U, __m128i __D) -{ - return (__m128i) __builtin_ia32_expandqi128_mask ((__v16qi) __D, - (__v16qi) _mm_setzero_si128(), - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_epi16(__m128i __S, __mmask8 __U, void const *__P) -{ - return 
(__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, - (__v8hi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_epi16(__mmask8 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_expandloadhi128_mask ((const __v8hi *)__P, - (__v8hi) _mm_setzero_si128(), - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_expandloadu_epi8(__m128i __S, __mmask16 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, - (__v16qi) __S, - __U); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_expandloadu_epi8(__mmask16 __U, void const *__P) -{ - return (__m128i) __builtin_ia32_expandloadqi128_mask ((const __v16qi *)__P, - (__v16qi) _mm_setzero_si128(), - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_epi16(__m256i __S, __mmask16 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, - (__v16hi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_epi16(__mmask16 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_compresshi256_mask ((__v16hi) __D, - (__v16hi) _mm256_setzero_si256(), - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_compress_epi8(__m256i __S, __mmask32 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, - (__v32qi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_compress_epi8(__mmask32 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_compressqi256_mask ((__v32qi) __D, - (__v32qi) _mm256_setzero_si256(), - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_epi16(void *__P, __mmask16 __U, __m256i __D) -{ - __builtin_ia32_compressstorehi256_mask ((__v16hi *) __P, (__v16hi) __D, - __U); -} - -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_mask_compressstoreu_epi8(void *__P, __mmask32 __U, __m256i __D) -{ - __builtin_ia32_compressstoreqi256_mask ((__v32qi *) __P, (__v32qi) __D, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_epi16(__m256i __S, __mmask16 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, - (__v16hi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_epi16(__mmask16 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_expandhi256_mask ((__v16hi) __D, - (__v16hi) _mm256_setzero_si256(), - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expand_epi8(__m256i __S, __mmask32 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, - (__v32qi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expand_epi8(__mmask32 __U, __m256i __D) -{ - return (__m256i) __builtin_ia32_expandqi256_mask ((__v32qi) __D, - (__v32qi) _mm256_setzero_si256(), - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_epi16(__m256i __S, __mmask16 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, - (__v16hi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_epi16(__mmask16 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_expandloadhi256_mask ((const __v16hi *)__P, - (__v16hi) _mm256_setzero_si256(), - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_expandloadu_epi8(__m256i __S, 
__mmask32 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, - (__v32qi) __S, - __U); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_expandloadu_epi8(__mmask32 __U, void const *__P) -{ - return (__m256i) __builtin_ia32_expandloadqi256_mask ((const __v32qi *)__P, - (__v32qi) _mm256_setzero_si256(), - __U); -} - -#define _mm256_shldi_epi64(A, B, I) \ - ((__m256i)__builtin_ia32_vpshldq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shldi_epi64(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S))) - -#define _mm256_maskz_shldi_epi64(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shldi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256())) - -#define _mm_shldi_epi64(A, B, I) \ - ((__m128i)__builtin_ia32_vpshldq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I))) - -#define _mm_mask_shldi_epi64(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S))) - -#define _mm_maskz_shldi_epi64(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shldi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_shldi_epi32(A, B, I) \ - ((__m256i)__builtin_ia32_vpshldd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shldi_epi32(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S))) - -#define _mm256_maskz_shldi_epi32(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shldi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_shldi_epi32(A, B, I) \ - ((__m128i)__builtin_ia32_vpshldd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I))) - -#define _mm_mask_shldi_epi32(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S))) - -#define _mm_maskz_shldi_epi32(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shldi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_shldi_epi16(A, B, I) \ - ((__m256i)__builtin_ia32_vpshldw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shldi_epi16(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S))) - -#define _mm256_maskz_shldi_epi16(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shldi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256())) - -#define _mm_shldi_epi16(A, B, I) \ - ((__m128i)__builtin_ia32_vpshldw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I))) - -#define _mm_mask_shldi_epi16(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S))) - -#define _mm_maskz_shldi_epi16(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shldi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128())) - -#define _mm256_shrdi_epi64(A, B, I) \ - ((__m256i)__builtin_ia32_vpshrdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shrdi_epi64(S, U, A, 
B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)(__m256i)(S))) - -#define _mm256_maskz_shrdi_epi64(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectq_256((__mmask8)(U), \ - (__v4di)_mm256_shrdi_epi64((A), (B), (I)), \ - (__v4di)_mm256_setzero_si256())) - -#define _mm_shrdi_epi64(A, B, I) \ - ((__m128i)__builtin_ia32_vpshrdq128((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (int)(I))) - -#define _mm_mask_shrdi_epi64(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)(__m128i)(S))) - -#define _mm_maskz_shrdi_epi64(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectq_128((__mmask8)(U), \ - (__v2di)_mm_shrdi_epi64((A), (B), (I)), \ - (__v2di)_mm_setzero_si128())) - -#define _mm256_shrdi_epi32(A, B, I) \ - ((__m256i)__builtin_ia32_vpshrdd256((__v8si)(__m256i)(A), \ - (__v8si)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)(__m256i)(S))) - -#define _mm256_maskz_shrdi_epi32(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectd_256((__mmask8)(U), \ - (__v8si)_mm256_shrdi_epi32((A), (B), (I)), \ - (__v8si)_mm256_setzero_si256())) - -#define _mm_shrdi_epi32(A, B, I) \ - ((__m128i)__builtin_ia32_vpshrdd128((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (int)(I))) - -#define _mm_mask_shrdi_epi32(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)(__m128i)(S))) - -#define _mm_maskz_shrdi_epi32(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectd_128((__mmask8)(U), \ - (__v4si)_mm_shrdi_epi32((A), (B), (I)), \ - (__v4si)_mm_setzero_si128())) - -#define _mm256_shrdi_epi16(A, B, I) \ - ((__m256i)__builtin_ia32_vpshrdw256((__v16hi)(__m256i)(A), \ - (__v16hi)(__m256i)(B), (int)(I))) - -#define _mm256_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)(__m256i)(S))) - -#define _mm256_maskz_shrdi_epi16(U, A, B, I) \ - ((__m256i)__builtin_ia32_selectw_256((__mmask16)(U), \ - (__v16hi)_mm256_shrdi_epi16((A), (B), (I)), \ - (__v16hi)_mm256_setzero_si256())) - -#define _mm_shrdi_epi16(A, B, I) \ - ((__m128i)__builtin_ia32_vpshrdw128((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (int)(I))) - -#define _mm_mask_shrdi_epi16(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)(__m128i)(S))) - -#define _mm_maskz_shrdi_epi16(U, A, B, I) \ - ((__m128i)__builtin_ia32_selectw_128((__mmask8)(U), \ - (__v8hi)_mm_shrdi_epi16((A), (B), (I)), \ - (__v8hi)_mm_setzero_si128())) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi64(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshldvq256((__v4di)__A, (__v4di)__B, - (__v4di)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shldv_epi64(__A, __B, __C), - (__v4di)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shldv_epi64(__A, __B, __C), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i 
__DEFAULT_FN_ATTRS128 -_mm_shldv_epi64(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshldvq128((__v2di)__A, (__v2di)__B, - (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shldv_epi64(__A, __B, __C), - (__v2di)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shldv_epi64(__A, __B, __C), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi32(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshldvd256((__v8si)__A, (__v8si)__B, - (__v8si)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shldv_epi32(__A, __B, __C), - (__v8si)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shldv_epi32(__A, __B, __C), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shldv_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshldvd128((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shldv_epi32(__A, __B, __C), - (__v4si)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shldv_epi32(__A, __B, __C), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shldv_epi16(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshldvw256((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shldv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shldv_epi16(__A, __B, __C), - (__v16hi)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shldv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shldv_epi16(__A, __B, __C), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shldv_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshldvw128((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shldv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shldv_epi16(__A, __B, __C), - (__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shldv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shldv_epi16(__A, __B, __C), - (__v8hi)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 
-_mm256_shrdv_epi64(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshrdvq256((__v4di)__A, (__v4di)__B, - (__v4di)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi64(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shrdv_epi64(__A, __B, __C), - (__v4di)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi64(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectq_256(__U, - (__v4di)_mm256_shrdv_epi64(__A, __B, __C), - (__v4di)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shrdv_epi64(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshrdvq128((__v2di)__A, (__v2di)__B, - (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi64(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shrdv_epi64(__A, __B, __C), - (__v2di)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi64(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectq_128(__U, - (__v2di)_mm_shrdv_epi64(__A, __B, __C), - (__v2di)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shrdv_epi32(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshrdvd256((__v8si)__A, (__v8si)__B, - (__v8si)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi32(__m256i __A, __mmask8 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shrdv_epi32(__A, __B, __C), - (__v8si)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi32(__mmask8 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_shrdv_epi32(__A, __B, __C), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_shrdv_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshrdvd128((__v4si)__A, (__v4si)__B, - (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi32(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shrdv_epi32(__A, __B, __C), - (__v4si)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi32(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_shrdv_epi32(__A, __B, __C), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_shrdv_epi16(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_vpshrdvw256((__v16hi)__A, (__v16hi)__B, - (__v16hi)__C); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_shrdv_epi16(__m256i __A, __mmask16 __U, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), - (__v16hi)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_shrdv_epi16(__mmask16 __U, __m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)__builtin_ia32_selectw_256(__U, - (__v16hi)_mm256_shrdv_epi16(__A, __B, __C), - (__v16hi)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
-_mm_shrdv_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpshrdvw128((__v8hi)__A, (__v8hi)__B, - (__v8hi)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_shrdv_epi16(__m128i __A, __mmask8 __U, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shrdv_epi16(__A, __B, __C), - (__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_shrdv_epi16(__mmask8 __U, __m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_selectw_128(__U, - (__v8hi)_mm_shrdv_epi16(__A, __B, __C), - (__v8hi)_mm_setzero_si128()); -} - - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlvnniintrin.h b/include/avx512vlvnniintrin.h deleted file mode 100644 index 0fb29af..0000000 --- a/include/avx512vlvnniintrin.h +++ /dev/null @@ -1,304 +0,0 @@ -/*===------------- avx512vlvnniintrin.h - VNNI intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VLVNNIINTRIN_H -#define __AVX512VLVNNIINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vnni"), __min_vector_width__(256))) - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with -/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a S, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) -/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -#define _mm256_dpbusd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpbusd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with -/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a S using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSDS instructions. 
-/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) -/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -#define _mm256_dpbusds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpbusds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with -/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, -/// and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) -/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) -/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -#define _mm256_dpwssd_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssd256((__v8si)(S), (__v8si)(A), (__v8si)(B))) - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with -/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a S -/// using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSDS instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) -/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) -/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -#define _mm256_dpwssds_epi32(S, A, B) \ - ((__m256i)__builtin_ia32_vpdpwssds256((__v8si)(S), (__v8si)(A), (__v8si)(B))) - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with -/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a S, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSD instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) -/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -#define _mm_dpbusd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpbusd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a A with -/// corresponding signed 8-bit integers in \a B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a S using signed saturation, and store the packed 32-bit results in DST. 
-/// -/// This intrinsic corresponds to the VPDPBUSDS instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.word := Signed(ZeroExtend16(A.byte[4*j]) * SignExtend16(B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(A.byte[4*j+1]) * SignExtend16(B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(A.byte[4*j+2]) * SignExtend16(B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(A.byte[4*j+3]) * SignExtend16(B.byte[4*j+3])) -/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -#define _mm_dpbusds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpbusds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with -/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a S, -/// and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSD instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) -/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) -/// DST.dword[j] := S.dword[j] + tmp1 + tmp2 -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -#define _mm_dpwssd_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssd128((__v4si)(S), (__v4si)(A), (__v4si)(B))) - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a A with -/// corresponding 16-bit integers in \a B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a S -/// using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSDS instructions. 
-/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.dword := SignExtend32(A.word[2*j]) * SignExtend32(B.word[2*j]) -/// tmp2.dword := SignExtend32(A.word[2*j+1]) * SignExtend32(B.word[2*j+1]) -/// DST.dword[j] := Saturate32(S.dword[j] + tmp1 + tmp2) -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -#define _mm_dpwssds_epi32(S, A, B) \ - ((__m128i)__builtin_ia32_vpdpwssds128((__v4si)(S), (__v4si)(A), (__v4si)(B))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpbusd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), - (__v8si)__S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpbusd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusd_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpbusds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), - (__v8si)__S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpbusds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpbusds_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpwssd_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), - (__v8si)__S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpwssd_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssd_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_dpwssds_epi32(__m256i __S, __mmask8 __U, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), - (__v8si)__S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_dpwssds_epi32(__mmask8 __U, __m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_selectd_256(__U, - (__v8si)_mm256_dpwssds_epi32(__S, __A, __B), - (__v8si)_mm256_setzero_si256()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpbusd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusd_epi32(__S, __A, __B), - (__v4si)__S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpbusd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusd_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpbusds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusds_epi32(__S, __A, __B), - (__v4si)__S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpbusds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpbusds_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 
-_mm_mask_dpwssd_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssd_epi32(__S, __A, __B), - (__v4si)__S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpwssd_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssd_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_dpwssds_epi32(__m128i __S, __mmask8 __U, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssds_epi32(__S, __A, __B), - (__v4si)__S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_dpwssds_epi32(__mmask8 __U, __m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_selectd_128(__U, - (__v4si)_mm_dpwssds_epi32(__S, __A, __B), - (__v4si)_mm_setzero_si128()); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vlvp2intersectintrin.h b/include/avx512vlvp2intersectintrin.h deleted file mode 100644 index 3e0815e..0000000 --- a/include/avx512vlvp2intersectintrin.h +++ /dev/null @@ -1,121 +0,0 @@ -/*===------ avx512vlvp2intersectintrin.h - VL VP2INTERSECT intrinsics ------=== - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef _AVX512VLVP2INTERSECT_H -#define _AVX512VLVP2INTERSECT_H - -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ - __min_vector_width__(128))) - -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vl,avx512vp2intersect"), \ - __min_vector_width__(256))) -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between dwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTD instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x i32]. 
-/// \param __b -/// A 256-bit vector of [8 x i32] -/// \param __m0 -/// A pointer point to 8-bit mask -/// \param __m1 -/// A pointer point to 8-bit mask -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_2intersect_epi32(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { - __builtin_ia32_vp2intersect_d_256((__v8si)__a, (__v8si)__b, __m0, __m1); -} - -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between quadwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTQ instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x i64]. -/// \param __b -/// A 256-bit vector of [4 x i64] -/// \param __m0 -/// A pointer point to 8-bit mask -/// \param __m1 -/// A pointer point to 8-bit mask -static __inline__ void __DEFAULT_FN_ATTRS256 -_mm256_2intersect_epi64(__m256i __a, __m256i __b, __mmask8 *__m0, __mmask8 *__m1) { - __builtin_ia32_vp2intersect_q_256((__v4di)__a, (__v4di)__b, __m0, __m1); -} - -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between dwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32]. -/// \param __b -/// A 128-bit vector of [4 x i32] -/// \param __m0 -/// A pointer point to 8-bit mask -/// \param __m1 -/// A pointer point to 8-bit mask -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_2intersect_epi32(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { - __builtin_ia32_vp2intersect_d_128((__v4si)__a, (__v4si)__b, __m0, __m1); -} - -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between quadwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTQ instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x i64]. -/// \param __b -/// A 128-bit vector of [2 x i64] -/// \param __m0 -/// A pointer point to 8-bit mask -/// \param __m1 -/// A pointer point to 8-bit mask -static __inline__ void __DEFAULT_FN_ATTRS128 -_mm_2intersect_epi64(__m128i __a, __m128i __b, __mmask8 *__m0, __mmask8 *__m1) { - __builtin_ia32_vp2intersect_q_128((__v2di)__a, (__v2di)__b, __m0, __m1); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avx512vnniintrin.h b/include/avx512vnniintrin.h deleted file mode 100644 index 9935a11..0000000 --- a/include/avx512vnniintrin.h +++ /dev/null @@ -1,115 +0,0 @@ -/*===------------- avx512vnniintrin.h - VNNI intrinsics ------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVX512VNNIINTRIN_H -#define __AVX512VNNIINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx512vnni"), __min_vector_width__(512))) - - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusd_epi32(__m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpdpbusd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpbusd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), - (__v16si)__S); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpbusd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusd_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpbusds_epi32(__m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpdpbusds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpbusds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), - (__v16si)__S); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpbusds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpbusds_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssd_epi32(__m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpdpwssd512((__v16si)__S, (__v16si)__A, - (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpwssd_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), - (__v16si)__S); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpwssd_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssd_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_dpwssds_epi32(__m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_vpdpwssds512((__v16si)__S, (__v16si)__A, - (__v16si)__B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_dpwssds_epi32(__m512i __S, __mmask16 __U, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), - (__v16si)__S); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_dpwssds_epi32(__mmask16 __U, __m512i __S, __m512i __A, __m512i __B) -{ - return (__m512i)__builtin_ia32_selectd_512(__U, - (__v16si)_mm512_dpwssds_epi32(__S, __A, __B), - (__v16si)_mm512_setzero_si512()); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512vp2intersectintrin.h b/include/avx512vp2intersectintrin.h deleted file mode 100644 index 5d3cb48..0000000 --- a/include/avx512vp2intersectintrin.h +++ /dev/null @@ -1,77 +0,0 @@ -/*===------- avx512vpintersectintrin.h - VP2INTERSECT intrinsics ------------=== - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without 
restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef _AVX512VP2INTERSECT_H -#define _AVX512VP2INTERSECT_H - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vp2intersect"), \ - __min_vector_width__(512))) - -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between dwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTD instruction. -/// -/// \param __a -/// A 512-bit vector of [16 x i32]. -/// \param __b -/// A 512-bit vector of [16 x i32] -/// \param __m0 -/// A pointer point to 16-bit mask -/// \param __m1 -/// A pointer point to 16-bit mask -static __inline__ void __DEFAULT_FN_ATTRS -_mm512_2intersect_epi32(__m512i __a, __m512i __b, __mmask16 *__m0, __mmask16 *__m1) { - __builtin_ia32_vp2intersect_d_512((__v16si)__a, (__v16si)__b, __m0, __m1); -} - -/// Store, in an even/odd pair of mask registers, the indicators of the -/// locations of value matches between quadwords in operands __a and __b. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VP2INTERSECTQ instruction. -/// -/// \param __a -/// A 512-bit vector of [8 x i64]. -/// \param __b -/// A 512-bit vector of [8 x i64] -/// \param __m0 -/// A pointer point to 8-bit mask -/// \param __m1 -/// A pointer point to 8-bit mask -static __inline__ void __DEFAULT_FN_ATTRS -_mm512_2intersect_epi64(__m512i __a, __m512i __b, __mmask8 *__m0, __mmask8 *__m1) { - __builtin_ia32_vp2intersect_q_512((__v8di)__a, (__v8di)__b, __m0, __m1); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512vpopcntdqintrin.h b/include/avx512vpopcntdqintrin.h deleted file mode 100644 index bb435e6..0000000 --- a/include/avx512vpopcntdqintrin.h +++ /dev/null @@ -1,54 +0,0 @@ -/*===----- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics-------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; include instead." -#endif - -#ifndef __AVX512VPOPCNTDQINTRIN_H -#define __AVX512VPOPCNTDQINTRIN_H - -/* Define the default attributes for the functions in this file. 
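/*
 * Illustrative usage sketch (not part of this patch): the AVX512-VNNI
 * dot-product intrinsics shown above accumulate u8 x s8 products into 32-bit
 * lanes. Assumes -mavx512vnni; the helper name is made up for the example.
 */
#include <immintrin.h>

/* Add, to each 32-bit lane of acc, the sum of four unsigned-byte x
 * signed-byte products taken from the corresponding lanes of u8s and s8s. */
static inline __m512i dot_accumulate(__m512i acc, __m512i u8s, __m512i s8s)
{
    return _mm512_dpbusd_epi32(acc, u8s, s8s);
}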
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq"), __min_vector_width__(512))) - -static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi64(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcntq_512((__v8di)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi64(__m512i __W, __mmask8 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectq_512( - (__mmask8)__U, (__v8di)_mm512_popcnt_epi64(__A), (__v8di)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi64(__mmask8 __U, __m512i __A) { - return _mm512_mask_popcnt_epi64((__m512i)_mm512_setzero_si512(), __U, __A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS _mm512_popcnt_epi32(__m512i __A) { - return (__m512i)__builtin_ia32_vpopcntd_512((__v16si)__A); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_mask_popcnt_epi32(__m512i __W, __mmask16 __U, __m512i __A) { - return (__m512i)__builtin_ia32_selectd_512( - (__mmask16)__U, (__v16si)_mm512_popcnt_epi32(__A), (__v16si)__W); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS -_mm512_maskz_popcnt_epi32(__mmask16 __U, __m512i __A) { - return _mm512_mask_popcnt_epi32((__m512i)_mm512_setzero_si512(), __U, __A); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/avx512vpopcntdqvlintrin.h b/include/avx512vpopcntdqvlintrin.h deleted file mode 100644 index a3cb9b6..0000000 --- a/include/avx512vpopcntdqvlintrin.h +++ /dev/null @@ -1,91 +0,0 @@ -/*===---- avx512vpopcntdqintrin.h - AVX512VPOPCNTDQ intrinsics -------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error \ - "Never use directly; include instead." -#endif - -#ifndef __AVX512VPOPCNTDQVLINTRIN_H -#define __AVX512VPOPCNTDQVLINTRIN_H - -/* Define the default attributes for the functions in this file. 
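/*
 * Illustrative usage sketch (not part of this patch): counting the set bits
 * of a 512-bit block with the VPOPCNTDQ intrinsics above. Assumes
 * -mavx512vpopcntdq plus AVX512F for the reduction helper; the function name
 * is made up for the example.
 */
#include <immintrin.h>

static inline long long popcount512(__m512i v)
{
    /* Per-lane 64-bit population counts, then a horizontal sum. */
    return _mm512_reduce_add_epi64(_mm512_popcnt_epi64(v));
}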
*/ -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("avx512vpopcntdq,avx512vl"), __min_vector_width__(256))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi64(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcntq_128((__v2di)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi64(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectq_128( - (__mmask8)__U, (__v2di)_mm_popcnt_epi64(__A), (__v2di)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi64(__mmask8 __U, __m128i __A) { - return _mm_mask_popcnt_epi64((__m128i)_mm_setzero_si128(), __U, __A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_popcnt_epi32(__m128i __A) { - return (__m128i)__builtin_ia32_vpopcntd_128((__v4si)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_mask_popcnt_epi32(__m128i __W, __mmask8 __U, __m128i __A) { - return (__m128i)__builtin_ia32_selectd_128( - (__mmask8)__U, (__v4si)_mm_popcnt_epi32(__A), (__v4si)__W); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_maskz_popcnt_epi32(__mmask8 __U, __m128i __A) { - return _mm_mask_popcnt_epi32((__m128i)_mm_setzero_si128(), __U, __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi64(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcntq_256((__v4di)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi64(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectq_256( - (__mmask8)__U, (__v4di)_mm256_popcnt_epi64(__A), (__v4di)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi64(__mmask8 __U, __m256i __A) { - return _mm256_mask_popcnt_epi64((__m256i)_mm256_setzero_si256(), __U, __A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_popcnt_epi32(__m256i __A) { - return (__m256i)__builtin_ia32_vpopcntd_256((__v8si)__A); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_mask_popcnt_epi32(__m256i __W, __mmask8 __U, __m256i __A) { - return (__m256i)__builtin_ia32_selectd_256( - (__mmask8)__U, (__v8si)_mm256_popcnt_epi32(__A), (__v8si)__W); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_maskz_popcnt_epi32(__mmask8 __U, __m256i __A) { - return _mm256_mask_popcnt_epi32((__m256i)_mm256_setzero_si256(), __U, __A); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif diff --git a/include/avxintrin.h b/include/avxintrin.h deleted file mode 100644 index 2f2a159..0000000 --- a/include/avxintrin.h +++ /dev/null @@ -1,5062 +0,0 @@ -/*===---- avxintrin.h - AVX intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
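/*
 * Illustrative usage sketch (not part of this patch): the 128-bit VPOPCNTDQ
 * variants above combine with a write mask. Assumes -mavx512vpopcntdq
 * -mavx512vl; the helper name is made up for the example.
 */
#include <immintrin.h>

/* Population count of each 32-bit lane selected by keep; other lanes are
 * zeroed. */
static inline __m128i masked_popcnt32(__mmask8 keep, __m128i v)
{
    return _mm_maskz_popcnt_epi32(keep, v);
}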
-#endif - -#ifndef __AVXINTRIN_H -#define __AVXINTRIN_H - -typedef double __v4df __attribute__ ((__vector_size__ (32))); -typedef float __v8sf __attribute__ ((__vector_size__ (32))); -typedef long long __v4di __attribute__ ((__vector_size__ (32))); -typedef int __v8si __attribute__ ((__vector_size__ (32))); -typedef short __v16hi __attribute__ ((__vector_size__ (32))); -typedef char __v32qi __attribute__ ((__vector_size__ (32))); - -/* Unsigned types */ -typedef unsigned long long __v4du __attribute__ ((__vector_size__ (32))); -typedef unsigned int __v8su __attribute__ ((__vector_size__ (32))); -typedef unsigned short __v16hu __attribute__ ((__vector_size__ (32))); -typedef unsigned char __v32qu __attribute__ ((__vector_size__ (32))); - -/* We need an explicitly signed variant for char. Note that this shouldn't - * appear in the interface though. */ -typedef signed char __v32qs __attribute__((__vector_size__(32))); - -typedef float __m256 __attribute__ ((__vector_size__ (32), __aligned__(32))); -typedef double __m256d __attribute__((__vector_size__(32), __aligned__(32))); -typedef long long __m256i __attribute__((__vector_size__(32), __aligned__(32))); - -typedef float __m256_u __attribute__ ((__vector_size__ (32), __aligned__(1))); -typedef double __m256d_u __attribute__((__vector_size__(32), __aligned__(1))); -typedef long long __m256i_u __attribute__((__vector_size__(32), __aligned__(1))); - -#if (__clang_major__ > 15) -#ifdef __SSE2__ -/* Both _Float16 and __bf16 require SSE2 being enabled. */ -typedef _Float16 __v16hf __attribute__((__vector_size__(32), __aligned__(32))); -typedef _Float16 __m256h __attribute__((__vector_size__(32), __aligned__(32))); -typedef _Float16 __m256h_u __attribute__((__vector_size__(32), __aligned__(1))); - -typedef __bf16 __v16bf __attribute__((__vector_size__(32), __aligned__(32))); -typedef __bf16 __m256bh __attribute__((__vector_size__(32), __aligned__(32))); -#endif -#endif - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avx"), __min_vector_width__(128))) - -/* Arithmetic */ -/// Adds two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \returns A 256-bit vector of [4 x double] containing the sums of both -/// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_add_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4df)__a+(__v4df)__b); -} - -/// Adds two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \returns A 256-bit vector of [8 x float] containing the sums of both -/// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_add_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8sf)__a+(__v8sf)__b); -} - -/// Subtracts two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBPD instruction. 
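/*
 * Illustrative usage sketch (not part of this patch): element-wise addition
 * with the AVX arithmetic intrinsics above. Assumes -mavx; the helper name is
 * made up for the example.
 */
#include <immintrin.h>

/* c[i] = a[i] + b[i], four doubles per iteration; n must be a multiple of 4. */
static inline void add_arrays(double *c, const double *a, const double *b, int n)
{
    for (int i = 0; i < n; i += 4) {
        __m256d va = _mm256_loadu_pd(a + i);  /* unaligned 256-bit loads */
        __m256d vb = _mm256_loadu_pd(b + i);
        _mm256_storeu_pd(c + i, _mm256_add_pd(va, vb));
    }
}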
-/// -/// \param __a -/// A 256-bit vector of [4 x double] containing the minuend. -/// \param __b -/// A 256-bit vector of [4 x double] containing the subtrahend. -/// \returns A 256-bit vector of [4 x double] containing the differences between -/// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_sub_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4df)__a-(__v4df)__b); -} - -/// Subtracts two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing the minuend. -/// \param __b -/// A 256-bit vector of [8 x float] containing the subtrahend. -/// \returns A 256-bit vector of [8 x float] containing the differences between -/// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_sub_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8sf)__a-(__v8sf)__b); -} - -/// Adds the even-indexed values and subtracts the odd-indexed values of -/// two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSUBPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing the left source operand. -/// \param __b -/// A 256-bit vector of [4 x double] containing the right source operand. -/// \returns A 256-bit vector of [4 x double] containing the alternating sums -/// and differences between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_addsub_pd(__m256d __a, __m256d __b) -{ - return (__m256d)__builtin_ia32_addsubpd256((__v4df)__a, (__v4df)__b); -} - -/// Adds the even-indexed values and subtracts the odd-indexed values of -/// two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSUBPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing the left source operand. -/// \param __b -/// A 256-bit vector of [8 x float] containing the right source operand. -/// \returns A 256-bit vector of [8 x float] containing the alternating sums and -/// differences between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_addsub_ps(__m256 __a, __m256 __b) -{ - return (__m256)__builtin_ia32_addsubps256((__v8sf)__a, (__v8sf)__b); -} - -/// Divides two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing the dividend. -/// \param __b -/// A 256-bit vector of [4 x double] containing the divisor. -/// \returns A 256-bit vector of [4 x double] containing the quotients of both -/// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_div_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4df)__a/(__v4df)__b); -} - -/// Divides two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing the dividend. -/// \param __b -/// A 256-bit vector of [8 x float] containing the divisor. -/// \returns A 256-bit vector of [8 x float] containing the quotients of both -/// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_div_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8sf)__a/(__v8sf)__b); -} - -/// Compares two 256-bit vectors of [4 x double] and returns the greater -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXPD instruction. 
-/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \returns A 256-bit vector of [4 x double] containing the maximum values -/// between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_max_pd(__m256d __a, __m256d __b) -{ - return (__m256d)__builtin_ia32_maxpd256((__v4df)__a, (__v4df)__b); -} - -/// Compares two 256-bit vectors of [8 x float] and returns the greater -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \returns A 256-bit vector of [8 x float] containing the maximum values -/// between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_max_ps(__m256 __a, __m256 __b) -{ - return (__m256)__builtin_ia32_maxps256((__v8sf)__a, (__v8sf)__b); -} - -/// Compares two 256-bit vectors of [4 x double] and returns the lesser -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \returns A 256-bit vector of [4 x double] containing the minimum values -/// between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_min_pd(__m256d __a, __m256d __b) -{ - return (__m256d)__builtin_ia32_minpd256((__v4df)__a, (__v4df)__b); -} - -/// Compares two 256-bit vectors of [8 x float] and returns the lesser -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \returns A 256-bit vector of [8 x float] containing the minimum values -/// between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_min_ps(__m256 __a, __m256 __b) -{ - return (__m256)__builtin_ia32_minps256((__v8sf)__a, (__v8sf)__b); -} - -/// Multiplies two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the operands. -/// \returns A 256-bit vector of [4 x double] containing the products of both -/// operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_mul_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4df)__a * (__v4df)__b); -} - -/// Multiplies two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the operands. -/// \returns A 256-bit vector of [8 x float] containing the products of both -/// operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_mul_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8sf)__a * (__v8sf)__b); -} - -/// Calculates the square roots of the values in a 256-bit vector of -/// [4 x double]. 
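/*
 * Illustrative usage sketch (not part of this patch): clamping with the AVX
 * min/max intrinsics above. Assumes -mavx; the helper name is made up for the
 * example.
 */
#include <immintrin.h>

/* Clamp each of the eight floats in v into [lo, hi]. */
static inline __m256 clamp_ps(__m256 v, float lo, float hi)
{
    return _mm256_min_ps(_mm256_max_ps(v, _mm256_set1_ps(lo)),
                         _mm256_set1_ps(hi));
}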
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 256-bit vector of [4 x double] containing the square roots of the -/// values in the operand. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_sqrt_pd(__m256d __a) -{ - return (__m256d)__builtin_ia32_sqrtpd256((__v4df)__a); -} - -/// Calculates the square roots of the values in a 256-bit vector of -/// [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the square roots of the -/// values in the operand. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_sqrt_ps(__m256 __a) -{ - return (__m256)__builtin_ia32_sqrtps256((__v8sf)__a); -} - -/// Calculates the reciprocal square roots of the values in a 256-bit -/// vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRSQRTPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the reciprocal square -/// roots of the values in the operand. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_rsqrt_ps(__m256 __a) -{ - return (__m256)__builtin_ia32_rsqrtps256((__v8sf)__a); -} - -/// Calculates the reciprocals of the values in a 256-bit vector of -/// [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRCPPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the reciprocals of the -/// values in the operand. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_rcp_ps(__m256 __a) -{ - return (__m256)__builtin_ia32_rcpps256((__v8sf)__a); -} - -/// Rounds the values in a 256-bit vector of [4 x double] as specified -/// by the byte operand. The source values are rounded to integer values and -/// returned as 64-bit double-precision floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_round_pd(__m256d V, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD instruction. -/// -/// \param V -/// A 256-bit vector of [4 x double]. -/// \param M -/// An integer value that specifies the rounding operation. \n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used. \n -/// 1: The PE field is not updated. \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M. \n -/// 1: Use the current MXCSR setting. \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest. \n -/// 01: Downward (toward negative infinity). \n -/// 10: Upward (toward positive infinity). \n -/// 11: Truncated. -/// \returns A 256-bit vector of [4 x double] containing the rounded values. -#define _mm256_round_pd(V, M) \ - ((__m256d)__builtin_ia32_roundpd256((__v4df)(__m256d)(V), (M))) - -/// Rounds the values stored in a 256-bit vector of [8 x float] as -/// specified by the byte operand. The source values are rounded to integer -/// values and returned as floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_round_ps(__m256 V, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS instruction. -/// -/// \param V -/// A 256-bit vector of [8 x float]. -/// \param M -/// An integer value that specifies the rounding operation. 
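/*
 * Illustrative usage sketch (not part of this patch): passing a rounding mode
 * to _mm256_round_pd per the bit layout above. Assumes -mavx and the
 * _MM_FROUND_* constants pulled in by <immintrin.h>; the helper name is made
 * up for the example.
 */
#include <immintrin.h>

/* Round to nearest (bits [1:0] = 00) and suppress the precision exception
 * (bit [3] = 1). */
static inline __m256d round_nearest_pd(__m256d v)
{
    return _mm256_round_pd(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}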
\n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used. \n -/// 1: The PE field is not updated. \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M. \n -/// 1: Use the current MXCSR setting. \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest. \n -/// 01: Downward (toward negative infinity). \n -/// 10: Upward (toward positive infinity). \n -/// 11: Truncated. -/// \returns A 256-bit vector of [8 x float] containing the rounded values. -#define _mm256_round_ps(V, M) \ - ((__m256)__builtin_ia32_roundps256((__v8sf)(__m256)(V), (M))) - -/// Rounds up the values stored in a 256-bit vector of [4 x double]. The -/// source values are rounded up to integer values and returned as 64-bit -/// double-precision floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_ceil_pd(__m256d V); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD instruction. -/// -/// \param V -/// A 256-bit vector of [4 x double]. -/// \returns A 256-bit vector of [4 x double] containing the rounded up values. -#define _mm256_ceil_pd(V) _mm256_round_pd((V), _MM_FROUND_CEIL) - -/// Rounds down the values stored in a 256-bit vector of [4 x double]. -/// The source values are rounded down to integer values and returned as -/// 64-bit double-precision floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_floor_pd(__m256d V); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD instruction. -/// -/// \param V -/// A 256-bit vector of [4 x double]. -/// \returns A 256-bit vector of [4 x double] containing the rounded down -/// values. -#define _mm256_floor_pd(V) _mm256_round_pd((V), _MM_FROUND_FLOOR) - -/// Rounds up the values stored in a 256-bit vector of [8 x float]. The -/// source values are rounded up to integer values and returned as -/// floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_ceil_ps(__m256 V); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS instruction. -/// -/// \param V -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the rounded up values. -#define _mm256_ceil_ps(V) _mm256_round_ps((V), _MM_FROUND_CEIL) - -/// Rounds down the values stored in a 256-bit vector of [8 x float]. The -/// source values are rounded down to integer values and returned as -/// floating-point values. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_floor_ps(__m256 V); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS instruction. -/// -/// \param V -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the rounded down values. -#define _mm256_floor_ps(V) _mm256_round_ps((V), _MM_FROUND_FLOOR) - -/* Logical */ -/// Performs a bitwise AND of two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the -/// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_and_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4du)__a & (__v4du)__b); -} - -/// Performs a bitwise AND of two 256-bit vectors of [8 x float]. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the -/// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_and_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8su)__a & (__v8su)__b); -} - -/// Performs a bitwise AND of two 256-bit vectors of [4 x double], using -/// the one's complement of the values contained in the first source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDNPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing the left source operand. The -/// one's complement of this value is used in the bitwise AND. -/// \param __b -/// A 256-bit vector of [4 x double] containing the right source operand. -/// \returns A 256-bit vector of [4 x double] containing the bitwise AND of the -/// values of the second operand and the one's complement of the first -/// operand. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_andnot_pd(__m256d __a, __m256d __b) -{ - return (__m256d)(~(__v4du)__a & (__v4du)__b); -} - -/// Performs a bitwise AND of two 256-bit vectors of [8 x float], using -/// the one's complement of the values contained in the first source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDNPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing the left source operand. The -/// one's complement of this value is used in the bitwise AND. -/// \param __b -/// A 256-bit vector of [8 x float] containing the right source operand. -/// \returns A 256-bit vector of [8 x float] containing the bitwise AND of the -/// values of the second operand and the one's complement of the first -/// operand. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_andnot_ps(__m256 __a, __m256 __b) -{ - return (__m256)(~(__v8su)__a & (__v8su)__b); -} - -/// Performs a bitwise OR of two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VORPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \returns A 256-bit vector of [4 x double] containing the bitwise OR of the -/// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_or_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4du)__a | (__v4du)__b); -} - -/// Performs a bitwise OR of two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VORPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \returns A 256-bit vector of [8 x float] containing the bitwise OR of the -/// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_or_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8su)__a | (__v8su)__b); -} - -/// Performs a bitwise XOR of two 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPD instruction. 
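/*
 * Illustrative usage sketch (not part of this patch): absolute value via the
 * AVX bitwise intrinsics above. Assumes -mavx; the helper name is made up for
 * the example.
 */
#include <immintrin.h>

/* |v| for four doubles: ANDNOT against -0.0 clears each sign bit. */
static inline __m256d abs_pd(__m256d v)
{
    return _mm256_andnot_pd(_mm256_set1_pd(-0.0), v);
}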
-/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// \returns A 256-bit vector of [4 x double] containing the bitwise XOR of the -/// values between both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_xor_pd(__m256d __a, __m256d __b) -{ - return (__m256d)((__v4du)__a ^ (__v4du)__b); -} - -/// Performs a bitwise XOR of two 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// \returns A 256-bit vector of [8 x float] containing the bitwise XOR of the -/// values between both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_xor_ps(__m256 __a, __m256 __b) -{ - return (__m256)((__v8su)__a ^ (__v8su)__b); -} - -/* Horizontal arithmetic */ -/// Horizontally adds the adjacent pairs of values contained in two -/// 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHADDPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// The horizontal sums of the values are returned in the even-indexed -/// elements of a vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// The horizontal sums of the values are returned in the odd-indexed -/// elements of a vector of [4 x double]. -/// \returns A 256-bit vector of [4 x double] containing the horizontal sums of -/// both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_hadd_pd(__m256d __a, __m256d __b) -{ - return (__m256d)__builtin_ia32_haddpd256((__v4df)__a, (__v4df)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in two -/// 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHADDPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// The horizontal sums of the values are returned in the elements with -/// index 0, 1, 4, 5 of a vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// The horizontal sums of the values are returned in the elements with -/// index 2, 3, 6, 7 of a vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the horizontal sums of -/// both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hadd_ps(__m256 __a, __m256 __b) -{ - return (__m256)__builtin_ia32_haddps256((__v8sf)__a, (__v8sf)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in two -/// 256-bit vectors of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHSUBPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// The horizontal differences between the values are returned in the -/// even-indexed elements of a vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double] containing one of the source operands. -/// The horizontal differences between the values are returned in the -/// odd-indexed elements of a vector of [4 x double]. 
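/*
 * Illustrative usage sketch (not part of this patch): a horizontal sum built
 * from the VHADDPD intrinsic above. Assumes -mavx (the 128-bit helpers come
 * from SSE2); the function name is made up for the example.
 */
#include <immintrin.h>

/* Sum the four doubles in v. */
static inline double hsum_pd(__m256d v)
{
    __m256d pairs = _mm256_hadd_pd(v, v);            /* {v0+v1, v0+v1, v2+v3, v2+v3} */
    __m128d lo    = _mm256_castpd256_pd128(pairs);   /* v0+v1 */
    __m128d hi    = _mm256_extractf128_pd(pairs, 1); /* v2+v3 */
    return _mm_cvtsd_f64(_mm_add_sd(lo, hi));
}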
-/// \returns A 256-bit vector of [4 x double] containing the horizontal -/// differences of both operands. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_hsub_pd(__m256d __a, __m256d __b) -{ - return (__m256d)__builtin_ia32_hsubpd256((__v4df)__a, (__v4df)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in two -/// 256-bit vectors of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHSUBPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// The horizontal differences between the values are returned in the -/// elements with index 0, 1, 4, 5 of a vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float] containing one of the source operands. -/// The horizontal differences between the values are returned in the -/// elements with index 2, 3, 6, 7 of a vector of [8 x float]. -/// \returns A 256-bit vector of [8 x float] containing the horizontal -/// differences of both operands. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_hsub_ps(__m256 __a, __m256 __b) -{ - return (__m256)__builtin_ia32_hsubps256((__v8sf)__a, (__v8sf)__b); -} - -/* Vector permutations */ -/// Copies the values in a 128-bit vector of [2 x double] as specified -/// by the 128-bit integer vector operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __c -/// A 128-bit integer vector operand specifying how the values are to be -/// copied. \n -/// Bit [1]: \n -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned -/// vector. \n -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. \n -/// Bit [65]: \n -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. -/// \returns A 128-bit vector of [2 x double] containing the copied values. -static __inline __m128d __DEFAULT_FN_ATTRS128 -_mm_permutevar_pd(__m128d __a, __m128i __c) -{ - return (__m128d)__builtin_ia32_vpermilvarpd((__v2df)__a, (__v2di)__c); -} - -/// Copies the values in a 256-bit vector of [4 x double] as specified -/// by the 256-bit integer vector operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \param __c -/// A 256-bit integer vector operand specifying how the values are to be -/// copied. \n -/// Bit [1]: \n -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned -/// vector. \n -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. \n -/// Bit [65]: \n -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// Bit [129]: \n -/// 0: Bits [191:128] of the source are copied to bits [191:128] of the -/// returned vector. \n -/// 1: Bits [255:192] of the source are copied to bits [191:128] of the -/// returned vector. \n -/// Bit [193]: \n -/// 0: Bits [191:128] of the source are copied to bits [255:192] of the -/// returned vector. \n -/// 1: Bits [255:192] of the source are copied to bits [255:192] of the -/// returned vector. -/// \returns A 256-bit vector of [4 x double] containing the copied values. 
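/*
 * Illustrative usage sketch (not part of this patch): the variable permute
 * described above selects each destination lane with bit 1 of the matching
 * 64-bit control element. Assumes -mavx; the helper name is made up for the
 * example.
 */
#include <immintrin.h>

/* Swap the two doubles in v: lane 0 takes the high source half (control 2),
 * lane 1 takes the low source half (control 0). */
static inline __m128d swap_pd(__m128d v)
{
    return _mm_permutevar_pd(v, _mm_set_epi64x(0, 2));
}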
-static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_permutevar_pd(__m256d __a, __m256i __c) -{ - return (__m256d)__builtin_ia32_vpermilvarpd256((__v4df)__a, (__v4di)__c); -} - -/// Copies the values stored in a 128-bit vector of [4 x float] as -/// specified by the 128-bit integer vector operand. -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __c -/// A 128-bit integer vector operand specifying how the values are to be -/// copied. \n -/// Bits [1:0]: \n -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// Bits [33:32]: \n -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// Bits [65:64]: \n -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// Bits [97:96]: \n -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. -/// \returns A 128-bit vector of [4 x float] containing the copied values. -static __inline __m128 __DEFAULT_FN_ATTRS128 -_mm_permutevar_ps(__m128 __a, __m128i __c) -{ - return (__m128)__builtin_ia32_vpermilvarps((__v4sf)__a, (__v4si)__c); -} - -/// Copies the values stored in a 256-bit vector of [8 x float] as -/// specified by the 256-bit integer vector operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \param __c -/// A 256-bit integer vector operand specifying how the values are to be -/// copied. \n -/// Bits [1:0]: \n -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// Bits [33:32]: \n -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. 
\n -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// Bits [65:64]: \n -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// Bits [97:96]: \n -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// Bits [129:128]: \n -/// 00: Bits [159:128] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// Bits [161:160]: \n -/// 00: Bits [159:128] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// Bits [193:192]: \n -/// 00: Bits [159:128] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// Bits [225:224]: \n -/// 00: Bits [159:128] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [255:224] of the -/// returned vector. -/// \returns A 256-bit vector of [8 x float] containing the copied values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_permutevar_ps(__m256 __a, __m256i __c) -{ - return (__m256)__builtin_ia32_vpermilvarps256((__v8sf)__a, (__v8si)__c); -} - -/// Copies the values in a 128-bit vector of [2 x double] as specified -/// by the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_permute_pd(__m128d A, const int C); -/// \endcode -/// -/// This intrinsic corresponds to the VPERMILPD instruction. -/// -/// \param A -/// A 128-bit vector of [2 x double]. -/// \param C -/// An immediate integer operand specifying how the values are to be -/// copied. \n -/// Bit [0]: \n -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned -/// vector. \n -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. 
\n -/// Bit [1]: \n -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. -/// \returns A 128-bit vector of [2 x double] containing the copied values. -#define _mm_permute_pd(A, C) \ - ((__m128d)__builtin_ia32_vpermilpd((__v2df)(__m128d)(A), (int)(C))) - -/// Copies the values in a 256-bit vector of [4 x double] as specified by -/// the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_permute_pd(__m256d A, const int C); -/// \endcode -/// -/// This intrinsic corresponds to the VPERMILPD instruction. -/// -/// \param A -/// A 256-bit vector of [4 x double]. -/// \param C -/// An immediate integer operand specifying how the values are to be -/// copied. \n -/// Bit [0]: \n -/// 0: Bits [63:0] of the source are copied to bits [63:0] of the returned -/// vector. \n -/// 1: Bits [127:64] of the source are copied to bits [63:0] of the -/// returned vector. \n -/// Bit [1]: \n -/// 0: Bits [63:0] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// 1: Bits [127:64] of the source are copied to bits [127:64] of the -/// returned vector. \n -/// Bit [2]: \n -/// 0: Bits [191:128] of the source are copied to bits [191:128] of the -/// returned vector. \n -/// 1: Bits [255:192] of the source are copied to bits [191:128] of the -/// returned vector. \n -/// Bit [3]: \n -/// 0: Bits [191:128] of the source are copied to bits [255:192] of the -/// returned vector. \n -/// 1: Bits [255:192] of the source are copied to bits [255:192] of the -/// returned vector. -/// \returns A 256-bit vector of [4 x double] containing the copied values. -#define _mm256_permute_pd(A, C) \ - ((__m256d)__builtin_ia32_vpermilpd256((__v4df)(__m256d)(A), (int)(C))) - -/// Copies the values in a 128-bit vector of [4 x float] as specified by -/// the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_permute_ps(__m128 A, const int C); -/// \endcode -/// -/// This intrinsic corresponds to the VPERMILPS instruction. -/// -/// \param A -/// A 128-bit vector of [4 x float]. -/// \param C -/// An immediate integer operand specifying how the values are to be -/// copied. \n -/// Bits [1:0]: \n -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// Bits [3:2]: \n -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// Bits [5:4]: \n -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. 
\n -/// Bits [7:6]: \n -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. -/// \returns A 128-bit vector of [4 x float] containing the copied values. -#define _mm_permute_ps(A, C) \ - ((__m128)__builtin_ia32_vpermilps((__v4sf)(__m128)(A), (int)(C))) - -/// Copies the values in a 256-bit vector of [8 x float] as specified by -/// the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_permute_ps(__m256 A, const int C); -/// \endcode -/// -/// This intrinsic corresponds to the VPERMILPS instruction. -/// -/// \param A -/// A 256-bit vector of [8 x float]. -/// \param C -/// An immediate integer operand specifying how the values are to be -/// copied. \n -/// Bits [1:0]: \n -/// 00: Bits [31:0] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [31:0] of the -/// returned vector. \n -/// Bits [3:2]: \n -/// 00: Bits [31:0] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [63:32] of the -/// returned vector. \n -/// Bits [5:4]: \n -/// 00: Bits [31:0] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [95:64] of the -/// returned vector. \n -/// Bits [7:6]: \n -/// 00: Bits [31:0] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 01: Bits [63:32] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 10: Bits [95:64] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// 11: Bits [127:96] of the source are copied to bits [127:96] of the -/// returned vector. \n -/// Bits [1:0]: \n -/// 00: Bits [159:128] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [159:128] of the -/// returned vector. \n -/// Bits [3:2]: \n -/// 00: Bits [159:128] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [191:160] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [191:160] of the -/// returned vector. 
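/*
 * Illustrative usage sketch (not part of this patch): the immediate form of
 * VPERMILPS above. Assumes -mavx; the helper name is made up for the example.
 */
#include <immintrin.h>

/* Reverse the four floats: control 0x1B places source element 3-i in
 * destination element i. */
static inline __m128 reverse_ps(__m128 v)
{
    return _mm_permute_ps(v, 0x1B);
}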
\n -/// Bits [5:4]: \n -/// 00: Bits [159:128] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [223:192] of the -/// returned vector. \n -/// Bits [7:6]: \n -/// 00: Bits [159:128] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 01: Bits [191:160] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 10: Bits [223:192] of the source are copied to bits [255:224] of the -/// returned vector. \n -/// 11: Bits [255:224] of the source are copied to bits [255:224] of the -/// returned vector. -/// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_permute_ps(A, C) \ - ((__m256)__builtin_ia32_vpermilps256((__v8sf)(__m256)(A), (int)(C))) - -/// Permutes 128-bit data values stored in two 256-bit vectors of -/// [4 x double], as specified by the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_permute2f128_pd(__m256d V1, __m256d V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPERM2F128 instruction. -/// -/// \param V1 -/// A 256-bit vector of [4 x double]. -/// \param V2 -/// A 256-bit vector of [4 x double. -/// \param M -/// An immediate integer operand specifying how the values are to be -/// permuted. \n -/// Bits [1:0]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the -/// destination. \n -/// Bits [5:4]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the -/// destination. -/// \returns A 256-bit vector of [4 x double] containing the copied values. -#define _mm256_permute2f128_pd(V1, V2, M) \ - ((__m256d)__builtin_ia32_vperm2f128_pd256((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), (int)(M))) - -/// Permutes 128-bit data values stored in two 256-bit vectors of -/// [8 x float], as specified by the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_permute2f128_ps(__m256 V1, __m256 V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPERM2F128 instruction. -/// -/// \param V1 -/// A 256-bit vector of [8 x float]. -/// \param V2 -/// A 256-bit vector of [8 x float]. -/// \param M -/// An immediate integer operand specifying how the values are to be -/// permuted. \n -/// Bits [1:0]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the -/// destination. 
\n -/// Bits [5:4]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the -/// destination. -/// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_permute2f128_ps(V1, V2, M) \ - ((__m256)__builtin_ia32_vperm2f128_ps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (int)(M))) - -/// Permutes 128-bit data values stored in two 256-bit integer vectors, -/// as specified by the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256i _mm256_permute2f128_si256(__m256i V1, __m256i V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPERM2F128 instruction. -/// -/// \param V1 -/// A 256-bit integer vector. -/// \param V2 -/// A 256-bit integer vector. -/// \param M -/// An immediate integer operand specifying how the values are to be copied. -/// Bits [1:0]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [127:0] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [127:0] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [127:0] of the -/// destination. \n -/// Bits [5:4]: \n -/// 00: Bits [127:0] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 01: Bits [255:128] of operand \a V1 are copied to bits [255:128] of the -/// destination. \n -/// 10: Bits [127:0] of operand \a V2 are copied to bits [255:128] of the -/// destination. \n -/// 11: Bits [255:128] of operand \a V2 are copied to bits [255:128] of the -/// destination. -/// \returns A 256-bit integer vector containing the copied values. -#define _mm256_permute2f128_si256(V1, V2, M) \ - ((__m256i)__builtin_ia32_vperm2f128_si256((__v8si)(__m256i)(V1), \ - (__v8si)(__m256i)(V2), (int)(M))) - -/* Vector Blend */ -/// Merges 64-bit double-precision data values stored in either of the -/// two 256-bit vectors of [4 x double], as specified by the immediate -/// integer operand. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_blend_pd(__m256d V1, __m256d V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VBLENDPD instruction. -/// -/// \param V1 -/// A 256-bit vector of [4 x double]. -/// \param V2 -/// A 256-bit vector of [4 x double]. -/// \param M -/// An immediate integer operand, with mask bits [3:0] specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// index of a copied value. When a mask bit is 0, the corresponding 64-bit -/// element in operand \a V1 is copied to the same position in the -/// destination. When a mask bit is 1, the corresponding 64-bit element in -/// operand \a V2 is copied to the same position in the destination. -/// \returns A 256-bit vector of [4 x double] containing the copied values. -#define _mm256_blend_pd(V1, V2, M) \ - ((__m256d)__builtin_ia32_blendpd256((__v4df)(__m256d)(V1), \ - (__v4df)(__m256d)(V2), (int)(M))) - -/// Merges 32-bit single-precision data values stored in either of the -/// two 256-bit vectors of [8 x float], as specified by the immediate -/// integer operand. 
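/*
 * Illustrative usage sketch (not part of this patch): combining the 128-bit
 * lane permute and blend macros above. Assumes -mavx; the helper names are
 * made up for the example.
 */
#include <immintrin.h>

/* Swap the 128-bit halves of v: selector 0x01 routes V1's upper half to bits
 * [127:0] and V1's lower half to bits [255:128]. */
static inline __m256 swap_halves_ps(__m256 v)
{
    return _mm256_permute2f128_ps(v, v, 0x01);
}

/* Elements 0..3 from a, elements 4..7 from b (a set mask bit selects V2). */
static inline __m256 merge_halves_ps(__m256 a, __m256 b)
{
    return _mm256_blend_ps(a, b, 0xF0);
}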
-/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_blend_ps(__m256 V1, __m256 V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VBLENDPS instruction. -/// -/// \param V1 -/// A 256-bit vector of [8 x float]. -/// \param V2 -/// A 256-bit vector of [8 x float]. -/// \param M -/// An immediate integer operand, with mask bits [7:0] specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// index of a copied value. When a mask bit is 0, the corresponding 32-bit -/// element in operand \a V1 is copied to the same position in the -/// destination. When a mask bit is 1, the corresponding 32-bit element in -/// operand \a V2 is copied to the same position in the destination. -/// \returns A 256-bit vector of [8 x float] containing the copied values. -#define _mm256_blend_ps(V1, V2, M) \ - ((__m256)__builtin_ia32_blendps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (int)(M))) - -/// Merges 64-bit double-precision data values stored in either of the -/// two 256-bit vectors of [4 x double], as specified by the 256-bit vector -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDVPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double]. -/// \param __c -/// A 256-bit vector operand, with mask bits 255, 191, 127, and 63 specifying -/// how the values are to be copied. The position of the mask bit corresponds -/// to the most significant bit of a copied value. When a mask bit is 0, the -/// corresponding 64-bit element in operand \a __a is copied to the same -/// position in the destination. When a mask bit is 1, the corresponding -/// 64-bit element in operand \a __b is copied to the same position in the -/// destination. -/// \returns A 256-bit vector of [4 x double] containing the copied values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_blendv_pd(__m256d __a, __m256d __b, __m256d __c) -{ - return (__m256d)__builtin_ia32_blendvpd256( - (__v4df)__a, (__v4df)__b, (__v4df)__c); -} - -/// Merges 32-bit single-precision data values stored in either of the -/// two 256-bit vectors of [8 x float], as specified by the 256-bit vector -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDVPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float]. -/// \param __c -/// A 256-bit vector operand, with mask bits 255, 223, 191, 159, 127, 95, 63, -/// and 31 specifying how the values are to be copied. The position of the -/// mask bit corresponds to the most significant bit of a copied value. When -/// a mask bit is 0, the corresponding 32-bit element in operand \a __a is -/// copied to the same position in the destination. When a mask bit is 1, the -/// corresponding 32-bit element in operand \a __b is copied to the same -/// position in the destination. -/// \returns A 256-bit vector of [8 x float] containing the copied values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_blendv_ps(__m256 __a, __m256 __b, __m256 __c) -{ - return (__m256)__builtin_ia32_blendvps256( - (__v8sf)__a, (__v8sf)__b, (__v8sf)__c); -} - -/* Vector Dot Product */ -/// Computes two dot products in parallel, using the lower and upper -/// halves of two [8 x float] vectors as input to the two computations, and -/// returning the two dot products in the lower and upper halves of the -/// [8 x float] result. 
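[Editorial aside, not part of the header: before the dot-product details below, a short sketch of the variable blend (_mm256_blendv_pd) defined above, which selects per element based on the sign bit of a mask vector. Illustrative only; assumes -mavx and <immintrin.h>, with hypothetical values.]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d a = _mm256_setr_pd(1.0, 2.0, 3.0, 4.0);
    __m256d b = _mm256_setr_pd(-1.0, -2.0, -3.0, -4.0);

    /* Only the sign bit of each 64-bit mask element matters: elements with a
     * set sign bit select from b, the others keep the value from a. */
    __m256d mask = _mm256_setr_pd(-0.0, 1.0, -5.0, 3.0);
    __m256d r = _mm256_blendv_pd(a, b, mask);   /* r = {-1, 2, -3, 4} */

    double out[4];
    _mm256_storeu_pd(out, r);
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}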
-/// -/// The immediate integer operand controls which input elements will -/// contribute to the dot product, and where the final results are returned. -/// In general, for each dot product, the four corresponding elements of the -/// input vectors are multiplied; the first two and second two products are -/// summed, then the two sums are added to form the final result. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_dp_ps(__m256 V1, __m256 V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VDPPS instruction. -/// -/// \param V1 -/// A vector of [8 x float] values, treated as two [4 x float] vectors. -/// \param V2 -/// A vector of [8 x float] values, treated as two [4 x float] vectors. -/// \param M -/// An immediate integer argument. Bits [7:4] determine which elements of -/// the input vectors are used, with bit [4] corresponding to the lowest -/// element and bit [7] corresponding to the highest element of each [4 x -/// float] subvector. If a bit is set, the corresponding elements from the -/// two input vectors are used as an input for dot product; otherwise that -/// input is treated as zero. Bits [3:0] determine which elements of the -/// result will receive a copy of the final dot product, with bit [0] -/// corresponding to the lowest element and bit [3] corresponding to the -/// highest element of each [4 x float] subvector. If a bit is set, the dot -/// product is returned in the corresponding element; otherwise that element -/// is set to zero. The bitmask is applied in the same way to each of the -/// two parallel dot product computations. -/// \returns A 256-bit vector of [8 x float] containing the two dot products. -#define _mm256_dp_ps(V1, V2, M) \ - ((__m256)__builtin_ia32_dpps256((__v8sf)(__m256)(V1), \ - (__v8sf)(__m256)(V2), (M))) - -/* Vector shuffle */ -/// Selects 8 float values from the 256-bit operands of [8 x float], as -/// specified by the immediate value operand. -/// -/// The four selected elements in each operand are copied to the destination -/// according to the bits specified in the immediate operand. The selected -/// elements from the first 256-bit operand are copied to bits [63:0] and -/// bits [191:128] of the destination, and the selected elements from the -/// second 256-bit operand are copied to bits [127:64] and bits [255:192] of -/// the destination. For example, if bits [7:0] of the immediate operand -/// contain a value of 0xFF, the 256-bit destination vector would contain the -/// following values: b[7], b[7], a[7], a[7], b[3], b[3], a[3], a[3]. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_shuffle_ps(__m256 a, __m256 b, const int mask); -/// \endcode -/// -/// This intrinsic corresponds to the VSHUFPS instruction. -/// -/// \param a -/// A 256-bit vector of [8 x float]. The four selected elements in this -/// operand are copied to bits [63:0] and bits [191:128] in the destination, -/// according to the bits specified in the immediate operand. -/// \param b -/// A 256-bit vector of [8 x float]. The four selected elements in this -/// operand are copied to bits [127:64] and bits [255:192] in the -/// destination, according to the bits specified in the immediate operand. -/// \param mask -/// An immediate value containing an 8-bit value specifying which elements to -/// copy from \a a and \a b \n. -/// Bits [3:0] specify the values copied from operand \a a. \n -/// Bits [7:4] specify the values copied from operand \a b. 
\n -/// The destinations within the 256-bit destination are assigned values as -/// follows, according to the bit value assignments described below: \n -/// Bits [1:0] are used to assign values to bits [31:0] and [159:128] in the -/// destination. \n -/// Bits [3:2] are used to assign values to bits [63:32] and [191:160] in the -/// destination. \n -/// Bits [5:4] are used to assign values to bits [95:64] and [223:192] in the -/// destination. \n -/// Bits [7:6] are used to assign values to bits [127:96] and [255:224] in -/// the destination. \n -/// Bit value assignments: \n -/// 00: Bits [31:0] and [159:128] are copied from the selected operand. \n -/// 01: Bits [63:32] and [191:160] are copied from the selected operand. \n -/// 10: Bits [95:64] and [223:192] are copied from the selected operand. \n -/// 11: Bits [127:96] and [255:224] are copied from the selected operand. -/// \returns A 256-bit vector of [8 x float] containing the shuffled values. -#define _mm256_shuffle_ps(a, b, mask) \ - ((__m256)__builtin_ia32_shufps256((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (int)(mask))) - -/// Selects four double-precision values from the 256-bit operands of -/// [4 x double], as specified by the immediate value operand. -/// -/// The selected elements from the first 256-bit operand are copied to bits -/// [63:0] and bits [191:128] in the destination, and the selected elements -/// from the second 256-bit operand are copied to bits [127:64] and bits -/// [255:192] in the destination. For example, if bits [3:0] of the immediate -/// operand contain a value of 0xF, the 256-bit destination vector would -/// contain the following values: b[3], a[3], b[1], a[1]. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_shuffle_pd(__m256d a, __m256d b, const int mask); -/// \endcode -/// -/// This intrinsic corresponds to the VSHUFPD instruction. -/// -/// \param a -/// A 256-bit vector of [4 x double]. -/// \param b -/// A 256-bit vector of [4 x double]. -/// \param mask -/// An immediate value containing 8-bit values specifying which elements to -/// copy from \a a and \a b: \n -/// Bit [0]=0: Bits [63:0] are copied from \a a to bits [63:0] of the -/// destination. \n -/// Bit [0]=1: Bits [127:64] are copied from \a a to bits [63:0] of the -/// destination. \n -/// Bit [1]=0: Bits [63:0] are copied from \a b to bits [127:64] of the -/// destination. \n -/// Bit [1]=1: Bits [127:64] are copied from \a b to bits [127:64] of the -/// destination. \n -/// Bit [2]=0: Bits [191:128] are copied from \a a to bits [191:128] of the -/// destination. \n -/// Bit [2]=1: Bits [255:192] are copied from \a a to bits [191:128] of the -/// destination. \n -/// Bit [3]=0: Bits [191:128] are copied from \a b to bits [255:192] of the -/// destination. \n -/// Bit [3]=1: Bits [255:192] are copied from \a b to bits [255:192] of the -/// destination. -/// \returns A 256-bit vector of [4 x double] containing the shuffled values. 
-#define _mm256_shuffle_pd(a, b, mask) \ - ((__m256d)__builtin_ia32_shufpd256((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (int)(mask))) - -/* Compare */ -#define _CMP_EQ_OQ 0x00 /* Equal (ordered, non-signaling) */ -#define _CMP_LT_OS 0x01 /* Less-than (ordered, signaling) */ -#define _CMP_LE_OS 0x02 /* Less-than-or-equal (ordered, signaling) */ -#define _CMP_UNORD_Q 0x03 /* Unordered (non-signaling) */ -#define _CMP_NEQ_UQ 0x04 /* Not-equal (unordered, non-signaling) */ -#define _CMP_NLT_US 0x05 /* Not-less-than (unordered, signaling) */ -#define _CMP_NLE_US 0x06 /* Not-less-than-or-equal (unordered, signaling) */ -#define _CMP_ORD_Q 0x07 /* Ordered (non-signaling) */ -#define _CMP_EQ_UQ 0x08 /* Equal (unordered, non-signaling) */ -#define _CMP_NGE_US 0x09 /* Not-greater-than-or-equal (unordered, signaling) */ -#define _CMP_NGT_US 0x0a /* Not-greater-than (unordered, signaling) */ -#define _CMP_FALSE_OQ 0x0b /* False (ordered, non-signaling) */ -#define _CMP_NEQ_OQ 0x0c /* Not-equal (ordered, non-signaling) */ -#define _CMP_GE_OS 0x0d /* Greater-than-or-equal (ordered, signaling) */ -#define _CMP_GT_OS 0x0e /* Greater-than (ordered, signaling) */ -#define _CMP_TRUE_UQ 0x0f /* True (unordered, non-signaling) */ -#define _CMP_EQ_OS 0x10 /* Equal (ordered, signaling) */ -#define _CMP_LT_OQ 0x11 /* Less-than (ordered, non-signaling) */ -#define _CMP_LE_OQ 0x12 /* Less-than-or-equal (ordered, non-signaling) */ -#define _CMP_UNORD_S 0x13 /* Unordered (signaling) */ -#define _CMP_NEQ_US 0x14 /* Not-equal (unordered, signaling) */ -#define _CMP_NLT_UQ 0x15 /* Not-less-than (unordered, non-signaling) */ -#define _CMP_NLE_UQ 0x16 /* Not-less-than-or-equal (unordered, non-signaling) */ -#define _CMP_ORD_S 0x17 /* Ordered (signaling) */ -#define _CMP_EQ_US 0x18 /* Equal (unordered, signaling) */ -#define _CMP_NGE_UQ 0x19 /* Not-greater-than-or-equal (unordered, non-signaling) */ -#define _CMP_NGT_UQ 0x1a /* Not-greater-than (unordered, non-signaling) */ -#define _CMP_FALSE_OS 0x1b /* False (ordered, signaling) */ -#define _CMP_NEQ_OS 0x1c /* Not-equal (ordered, signaling) */ -#define _CMP_GE_OQ 0x1d /* Greater-than-or-equal (ordered, non-signaling) */ -#define _CMP_GT_OQ 0x1e /* Greater-than (ordered, non-signaling) */ -#define _CMP_TRUE_US 0x1f /* True (unordered, signaling) */ - -/// Compares each of the corresponding double-precision values of two -/// 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. -/// -/// Returns a [2 x double] vector consisting of two doubles corresponding to -/// the two comparison results: zero if the comparison is false, and all 1's -/// if the comparison is true. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_cmp_pd(__m128d a, __m128d b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPPD instruction. -/// -/// \param a -/// A 128-bit vector of [2 x double]. -/// \param b -/// A 128-bit vector of [2 x double]. 
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 128-bit vector of [2 x double] containing the comparison results. -#define _mm_cmp_pd(a, b, c) \ - ((__m128d)__builtin_ia32_cmppd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c))) - -/// Compares each of the corresponding values of two 128-bit vectors of -/// [4 x float], using the operation specified by the immediate integer -/// operand. -/// -/// Returns a [4 x float] vector consisting of four floats corresponding to -/// the four comparison results: zero if the comparison is false, and all 1's -/// if the comparison is true. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_cmp_ps(__m128 a, __m128 b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPPS instruction. -/// -/// \param a -/// A 128-bit vector of [4 x float]. -/// \param b -/// A 128-bit vector of [4 x float]. 
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -#define _mm_cmp_ps(a, b, c) \ - ((__m128)__builtin_ia32_cmpps((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c))) - -/// Compares each of the corresponding double-precision values of two -/// 256-bit vectors of [4 x double], using the operation specified by the -/// immediate integer operand. -/// -/// Returns a [4 x double] vector consisting of four doubles corresponding to -/// the four comparison results: zero if the comparison is false, and all 1's -/// if the comparison is true. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_cmp_pd(__m256d a, __m256d b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPPD instruction. -/// -/// \param a -/// A 256-bit vector of [4 x double]. -/// \param b -/// A 256-bit vector of [4 x double]. 
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 256-bit vector of [4 x double] containing the comparison results. -#define _mm256_cmp_pd(a, b, c) \ - ((__m256d)__builtin_ia32_cmppd256((__v4df)(__m256d)(a), \ - (__v4df)(__m256d)(b), (c))) - -/// Compares each of the corresponding values of two 256-bit vectors of -/// [8 x float], using the operation specified by the immediate integer -/// operand. -/// -/// Returns a [8 x float] vector consisting of eight floats corresponding to -/// the eight comparison results: zero if the comparison is false, and all -/// 1's if the comparison is true. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_cmp_ps(__m256 a, __m256 b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPPS instruction. -/// -/// \param a -/// A 256-bit vector of [8 x float]. -/// \param b -/// A 256-bit vector of [8 x float]. 
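[Editorial aside, not part of the header: the predicate constants above combine naturally with the variable blend to build branch-free selects. A minimal clamping sketch, illustrative only and assuming -mavx and <immintrin.h>:]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d x     = _mm256_setr_pd(0.5, 2.5, -1.0, 7.0);
    __m256d limit = _mm256_set1_pd(2.0);

    /* Lanes where x > limit become all-ones, the rest become all-zeros. */
    __m256d gt = _mm256_cmp_pd(x, limit, _CMP_GT_OQ);

    /* Where the mask is set, take limit; elsewhere keep x: min(x, limit). */
    __m256d clamped = _mm256_blendv_pd(x, limit, gt);

    double out[4];
    _mm256_storeu_pd(out, clamped);   /* 0.5 2 -1 2 */
    printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}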
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 256-bit vector of [8 x float] containing the comparison results. -#define _mm256_cmp_ps(a, b, c) \ - ((__m256)__builtin_ia32_cmpps256((__v8sf)(__m256)(a), \ - (__v8sf)(__m256)(b), (c))) - -/// Compares each of the corresponding scalar double-precision values of -/// two 128-bit vectors of [2 x double], using the operation specified by the -/// immediate integer operand. -/// -/// If the result is true, all 64 bits of the destination vector are set; -/// otherwise they are cleared. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_cmp_sd(__m128d a, __m128d b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPSD instruction. -/// -/// \param a -/// A 128-bit vector of [2 x double]. -/// \param b -/// A 128-bit vector of [2 x double]. 
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 128-bit vector of [2 x double] containing the comparison results. -#define _mm_cmp_sd(a, b, c) \ - ((__m128d)__builtin_ia32_cmpsd((__v2df)(__m128d)(a), \ - (__v2df)(__m128d)(b), (c))) - -/// Compares each of the corresponding scalar values of two 128-bit -/// vectors of [4 x float], using the operation specified by the immediate -/// integer operand. -/// -/// If the result is true, all 32 bits of the destination vector are set; -/// otherwise they are cleared. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_cmp_ss(__m128 a, __m128 b, const int c); -/// \endcode -/// -/// This intrinsic corresponds to the VCMPSS instruction. -/// -/// \param a -/// A 128-bit vector of [4 x float]. -/// \param b -/// A 128-bit vector of [4 x float]. 
-/// \param c -/// An immediate integer operand, with bits [4:0] specifying which comparison -/// operation to use: \n -/// 0x00: Equal (ordered, non-signaling) \n -/// 0x01: Less-than (ordered, signaling) \n -/// 0x02: Less-than-or-equal (ordered, signaling) \n -/// 0x03: Unordered (non-signaling) \n -/// 0x04: Not-equal (unordered, non-signaling) \n -/// 0x05: Not-less-than (unordered, signaling) \n -/// 0x06: Not-less-than-or-equal (unordered, signaling) \n -/// 0x07: Ordered (non-signaling) \n -/// 0x08: Equal (unordered, non-signaling) \n -/// 0x09: Not-greater-than-or-equal (unordered, signaling) \n -/// 0x0A: Not-greater-than (unordered, signaling) \n -/// 0x0B: False (ordered, non-signaling) \n -/// 0x0C: Not-equal (ordered, non-signaling) \n -/// 0x0D: Greater-than-or-equal (ordered, signaling) \n -/// 0x0E: Greater-than (ordered, signaling) \n -/// 0x0F: True (unordered, non-signaling) \n -/// 0x10: Equal (ordered, signaling) \n -/// 0x11: Less-than (ordered, non-signaling) \n -/// 0x12: Less-than-or-equal (ordered, non-signaling) \n -/// 0x13: Unordered (signaling) \n -/// 0x14: Not-equal (unordered, signaling) \n -/// 0x15: Not-less-than (unordered, non-signaling) \n -/// 0x16: Not-less-than-or-equal (unordered, non-signaling) \n -/// 0x17: Ordered (signaling) \n -/// 0x18: Equal (unordered, signaling) \n -/// 0x19: Not-greater-than-or-equal (unordered, non-signaling) \n -/// 0x1A: Not-greater-than (unordered, non-signaling) \n -/// 0x1B: False (ordered, signaling) \n -/// 0x1C: Not-equal (ordered, signaling) \n -/// 0x1D: Greater-than-or-equal (ordered, non-signaling) \n -/// 0x1E: Greater-than (ordered, non-signaling) \n -/// 0x1F: True (unordered, signaling) -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -#define _mm_cmp_ss(a, b, c) \ - ((__m128)__builtin_ia32_cmpss((__v4sf)(__m128)(a), \ - (__v4sf)(__m128)(b), (c))) - -/// Takes a [8 x i32] vector and returns the vector element value -/// indexed by the immediate constant operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x i32]. -/// \param __imm -/// An immediate integer operand with bits [2:0] determining which vector -/// element is extracted and returned. -/// \returns A 32-bit integer containing the extracted 32 bits of extended -/// packed data. -#define _mm256_extract_epi32(X, N) \ - ((int)__builtin_ia32_vec_ext_v8si((__v8si)(__m256i)(X), (int)(N))) - -/// Takes a [16 x i16] vector and returns the vector element value -/// indexed by the immediate constant operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A 256-bit integer vector of [16 x i16]. -/// \param __imm -/// An immediate integer operand with bits [3:0] determining which vector -/// element is extracted and returned. -/// \returns A 32-bit integer containing the extracted 16 bits of zero extended -/// packed data. -#define _mm256_extract_epi16(X, N) \ - ((int)(unsigned short)__builtin_ia32_vec_ext_v16hi((__v16hi)(__m256i)(X), \ - (int)(N))) - -/// Takes a [32 x i8] vector and returns the vector element value -/// indexed by the immediate constant operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A 256-bit integer vector of [32 x i8]. 
-/// \param __imm -/// An immediate integer operand with bits [4:0] determining which vector -/// element is extracted and returned. -/// \returns A 32-bit integer containing the extracted 8 bits of zero extended -/// packed data. -#define _mm256_extract_epi8(X, N) \ - ((int)(unsigned char)__builtin_ia32_vec_ext_v32qi((__v32qi)(__m256i)(X), \ - (int)(N))) - -#ifdef __x86_64__ -/// Takes a [4 x i64] vector and returns the vector element value -/// indexed by the immediate constant operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A 256-bit integer vector of [4 x i64]. -/// \param __imm -/// An immediate integer operand with bits [1:0] determining which vector -/// element is extracted and returned. -/// \returns A 64-bit integer containing the extracted 64 bits of extended -/// packed data. -#define _mm256_extract_epi64(X, N) \ - ((long long)__builtin_ia32_vec_ext_v4di((__v4di)(__m256i)(X), (int)(N))) -#endif - -/// Takes a [8 x i32] vector and replaces the vector element value -/// indexed by the immediate constant operand by a new value. Returns the -/// modified vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A vector of [8 x i32] to be used by the insert operation. -/// \param __b -/// An integer value. The replacement value for the insert operation. -/// \param __imm -/// An immediate integer specifying the index of the vector element to be -/// replaced. -/// \returns A copy of vector \a __a, after replacing its element indexed by -/// \a __imm with \a __b. -#define _mm256_insert_epi32(X, I, N) \ - ((__m256i)__builtin_ia32_vec_set_v8si((__v8si)(__m256i)(X), \ - (int)(I), (int)(N))) - - -/// Takes a [16 x i16] vector and replaces the vector element value -/// indexed by the immediate constant operand with a new value. Returns the -/// modified vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A vector of [16 x i16] to be used by the insert operation. -/// \param __b -/// An i16 integer value. The replacement value for the insert operation. -/// \param __imm -/// An immediate integer specifying the index of the vector element to be -/// replaced. -/// \returns A copy of vector \a __a, after replacing its element indexed by -/// \a __imm with \a __b. -#define _mm256_insert_epi16(X, I, N) \ - ((__m256i)__builtin_ia32_vec_set_v16hi((__v16hi)(__m256i)(X), \ - (int)(I), (int)(N))) - -/// Takes a [32 x i8] vector and replaces the vector element value -/// indexed by the immediate constant operand with a new value. Returns the -/// modified vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A vector of [32 x i8] to be used by the insert operation. -/// \param __b -/// An i8 integer value. The replacement value for the insert operation. -/// \param __imm -/// An immediate integer specifying the index of the vector element to be -/// replaced. -/// \returns A copy of vector \a __a, after replacing its element indexed by -/// \a __imm with \a __b. -#define _mm256_insert_epi8(X, I, N) \ - ((__m256i)__builtin_ia32_vec_set_v32qi((__v32qi)(__m256i)(X), \ - (int)(I), (int)(N))) - -#ifdef __x86_64__ -/// Takes a [4 x i64] vector and replaces the vector element value -/// indexed by the immediate constant operand with a new value. 
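[Editorial aside, not part of the header: an element-access sketch for the extract/insert forms already defined above. Illustrative only; the index arguments must be integer constants, and the values are hypothetical.]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256i v = _mm256_setr_epi32(0, 10, 20, 30, 40, 50, 60, 70);

    /* The element index must be an immediate constant. */
    int e5 = _mm256_extract_epi32(v, 5);          /* 50 */

    /* Returns a new vector with element 2 replaced; v itself is unchanged. */
    __m256i w = _mm256_insert_epi32(v, -7, 2);
    int e2 = _mm256_extract_epi32(w, 2);          /* -7 */

    printf("e5=%d e2=%d\n", e5, e2);
    return 0;
}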
Returns the -/// modified vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128+COMPOSITE -/// instruction. -/// -/// \param __a -/// A vector of [4 x i64] to be used by the insert operation. -/// \param __b -/// A 64-bit integer value. The replacement value for the insert operation. -/// \param __imm -/// An immediate integer specifying the index of the vector element to be -/// replaced. -/// \returns A copy of vector \a __a, after replacing its element indexed by -/// \a __imm with \a __b. -#define _mm256_insert_epi64(X, I, N) \ - ((__m256i)__builtin_ia32_vec_set_v4di((__v4di)(__m256i)(X), \ - (long long)(I), (int)(N))) -#endif - -/* Conversion */ -/// Converts a vector of [4 x i32] into a vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTDQ2PD instruction. -/// -/// \param __a -/// A 128-bit integer vector of [4 x i32]. -/// \returns A 256-bit vector of [4 x double] containing the converted values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_cvtepi32_pd(__m128i __a) -{ - return (__m256d)__builtin_convertvector((__v4si)__a, __v4df); -} - -/// Converts a vector of [8 x i32] into a vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTDQ2PS instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \returns A 256-bit vector of [8 x float] containing the converted values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_cvtepi32_ps(__m256i __a) -{ - return (__m256)__builtin_convertvector((__v8si)__a, __v8sf); -} - -/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPD2PS instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 128-bit vector of [4 x float] containing the converted values. -static __inline __m128 __DEFAULT_FN_ATTRS -_mm256_cvtpd_ps(__m256d __a) -{ - return (__m128)__builtin_ia32_cvtpd2ps256((__v4df) __a); -} - -/// Converts a vector of [8 x float] into a vector of [8 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPS2DQ instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit integer vector containing the converted values. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_cvtps_epi32(__m256 __a) -{ - return (__m256i)__builtin_ia32_cvtps2dq256((__v8sf) __a); -} - -/// Converts a 128-bit vector of [4 x float] into a 256-bit vector of [4 -/// x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPS2PD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 256-bit vector of [4 x double] containing the converted values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_cvtps_pd(__m128 __a) -{ - return (__m256d)__builtin_convertvector((__v4sf)__a, __v4df); -} - -/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 -/// x i32], truncating the result by rounding towards zero when it is -/// inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTPD2DQ instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 128-bit integer vector containing the converted values. -static __inline __m128i __DEFAULT_FN_ATTRS -_mm256_cvttpd_epi32(__m256d __a) -{ - return (__m128i)__builtin_ia32_cvttpd2dq256((__v4df) __a); -} - -/// Converts a 256-bit vector of [4 x double] into a 128-bit vector of [4 -/// x i32]. 
When a conversion is inexact, the value returned is rounded -/// according to the rounding control bits in the MXCSR register. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPD2DQ instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 128-bit integer vector containing the converted values. -static __inline __m128i __DEFAULT_FN_ATTRS -_mm256_cvtpd_epi32(__m256d __a) -{ - return (__m128i)__builtin_ia32_cvtpd2dq256((__v4df) __a); -} - -/// Converts a vector of [8 x float] into a vector of [8 x i32], -/// truncating the result by rounding towards zero when it is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTPS2DQ instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 256-bit integer vector containing the converted values. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_cvttps_epi32(__m256 __a) -{ - return (__m256i)__builtin_ia32_cvttps2dq256((__v8sf) __a); -} - -/// Returns the first element of the input vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \returns A 64 bit double containing the first element of the input vector. -static __inline double __DEFAULT_FN_ATTRS -_mm256_cvtsd_f64(__m256d __a) -{ - return __a[0]; -} - -/// Returns the first element of the input vector of [8 x i32]. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x i32]. -/// \returns A 32 bit integer containing the first element of the input vector. -static __inline int __DEFAULT_FN_ATTRS -_mm256_cvtsi256_si32(__m256i __a) -{ - __v8si __b = (__v8si)__a; - return __b[0]; -} - -/// Returns the first element of the input vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \returns A 32 bit float containing the first element of the input vector. -static __inline float __DEFAULT_FN_ATTRS -_mm256_cvtss_f32(__m256 __a) -{ - return __a[0]; -} - -/* Vector replicate */ -/// Moves and duplicates odd-indexed values from a 256-bit vector of -/// [8 x float] to float values in a 256-bit vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSHDUP instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. \n -/// Bits [255:224] of \a __a are written to bits [255:224] and [223:192] of -/// the return value. \n -/// Bits [191:160] of \a __a are written to bits [191:160] and [159:128] of -/// the return value. \n -/// Bits [127:96] of \a __a are written to bits [127:96] and [95:64] of the -/// return value. \n -/// Bits [63:32] of \a __a are written to bits [63:32] and [31:0] of the -/// return value. -/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated -/// values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_movehdup_ps(__m256 __a) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 1, 1, 3, 3, 5, 5, 7, 7); -} - -/// Moves and duplicates even-indexed values from a 256-bit vector of -/// [8 x float] to float values in a 256-bit vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSLDUP instruction. 
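[Editorial aside, not part of the header: a brief sketch contrasting the truncating and rounding double-to-int conversions described above, plus the scalar extraction helper. Illustrative only; assumes -mavx, <immintrin.h>, and the default MXCSR round-to-nearest mode.]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d d = _mm256_setr_pd(1.25, 2.75, -3.5, 4.5);

    __m128i trunc = _mm256_cvttpd_epi32(d);  /* toward zero: 1, 2, -3, 4 */
    __m128i round = _mm256_cvtpd_epi32(d);   /* nearest-even: 1, 3, -4, 4 */

    int t[4], r[4];
    _mm_storeu_si128((__m128i *)t, trunc);
    _mm_storeu_si128((__m128i *)r, round);
    printf("trunc: %d %d %d %d\n", t[0], t[1], t[2], t[3]);
    printf("round: %d %d %d %d\n", r[0], r[1], r[2], r[3]);

    /* Scalar extraction of the first element, no memory round trip. */
    printf("first = %g\n", _mm256_cvtsd_f64(d));  /* 1.25 */
    return 0;
}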
-/// -/// \param __a -/// A 256-bit vector of [8 x float]. \n -/// Bits [223:192] of \a __a are written to bits [255:224] and [223:192] of -/// the return value. \n -/// Bits [159:128] of \a __a are written to bits [191:160] and [159:128] of -/// the return value. \n -/// Bits [95:64] of \a __a are written to bits [127:96] and [95:64] of the -/// return value. \n -/// Bits [31:0] of \a __a are written to bits [63:32] and [31:0] of the -/// return value. -/// \returns A 256-bit vector of [8 x float] containing the moved and duplicated -/// values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_moveldup_ps(__m256 __a) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 0, 2, 2, 4, 4, 6, 6); -} - -/// Moves and duplicates double-precision floating point values from a -/// 256-bit vector of [4 x double] to double-precision values in a 256-bit -/// vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. \n -/// Bits [63:0] of \a __a are written to bits [127:64] and [63:0] of the -/// return value. \n -/// Bits [191:128] of \a __a are written to bits [255:192] and [191:128] of -/// the return value. -/// \returns A 256-bit vector of [4 x double] containing the moved and -/// duplicated values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_movedup_pd(__m256d __a) -{ - return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 0, 2, 2); -} - -/* Unpack and Interleave */ -/// Unpacks the odd-indexed vector elements from two 256-bit vectors of -/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKHPD instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. \n -/// Bits [127:64] are written to bits [63:0] of the return value. \n -/// Bits [255:192] are written to bits [191:128] of the return value. \n -/// \param __b -/// A 256-bit floating-point vector of [4 x double]. \n -/// Bits [127:64] are written to bits [127:64] of the return value. \n -/// Bits [255:192] are written to bits [255:192] of the return value. \n -/// \returns A 256-bit vector of [4 x double] containing the interleaved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpackhi_pd(__m256d __a, __m256d __b) -{ - return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 1, 5, 1+2, 5+2); -} - -/// Unpacks the even-indexed vector elements from two 256-bit vectors of -/// [4 x double] and interleaves them into a 256-bit vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. \n -/// Bits [63:0] are written to bits [63:0] of the return value. \n -/// Bits [191:128] are written to bits [191:128] of the return value. -/// \param __b -/// A 256-bit floating-point vector of [4 x double]. \n -/// Bits [63:0] are written to bits [127:64] of the return value. \n -/// Bits [191:128] are written to bits [255:192] of the return value. \n -/// \returns A 256-bit vector of [4 x double] containing the interleaved values. 
-static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_unpacklo_pd(__m256d __a, __m256d __b) -{ - return __builtin_shufflevector((__v4df)__a, (__v4df)__b, 0, 4, 0+2, 4+2); -} - -/// Unpacks the 32-bit vector elements 2, 3, 6 and 7 from each of the -/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit -/// vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKHPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. \n -/// Bits [95:64] are written to bits [31:0] of the return value. \n -/// Bits [127:96] are written to bits [95:64] of the return value. \n -/// Bits [223:192] are written to bits [159:128] of the return value. \n -/// Bits [255:224] are written to bits [223:192] of the return value. -/// \param __b -/// A 256-bit vector of [8 x float]. \n -/// Bits [95:64] are written to bits [63:32] of the return value. \n -/// Bits [127:96] are written to bits [127:96] of the return value. \n -/// Bits [223:192] are written to bits [191:160] of the return value. \n -/// Bits [255:224] are written to bits [255:224] of the return value. -/// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpackhi_ps(__m256 __a, __m256 __b) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 2, 10, 2+1, 10+1, 6, 14, 6+1, 14+1); -} - -/// Unpacks the 32-bit vector elements 0, 1, 4 and 5 from each of the -/// two 256-bit vectors of [8 x float] and interleaves them into a 256-bit -/// vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. \n -/// Bits [31:0] are written to bits [31:0] of the return value. \n -/// Bits [63:32] are written to bits [95:64] of the return value. \n -/// Bits [159:128] are written to bits [159:128] of the return value. \n -/// Bits [191:160] are written to bits [223:192] of the return value. -/// \param __b -/// A 256-bit vector of [8 x float]. \n -/// Bits [31:0] are written to bits [63:32] of the return value. \n -/// Bits [63:32] are written to bits [127:96] of the return value. \n -/// Bits [159:128] are written to bits [191:160] of the return value. \n -/// Bits [191:160] are written to bits [255:224] of the return value. -/// \returns A 256-bit vector of [8 x float] containing the interleaved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_unpacklo_ps(__m256 __a, __m256 __b) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)__b, 0, 8, 0+1, 8+1, 4, 12, 4+1, 12+1); -} - -/* Bit Test */ -/// Given two 128-bit floating-point vectors of [2 x double], perform an -/// element-by-element comparison of the double-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the ZF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. 
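[Editorial aside, not part of the header: a sketch of the interleaving behaviour described above. Note that unpacklo/unpackhi on [4 x double] interleave within each 128-bit half, not across the whole vector. Illustrative only; assumes -mavx and <immintrin.h>.]

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256d a = _mm256_setr_pd(0.0, 1.0, 2.0, 3.0);
    __m256d b = _mm256_setr_pd(10.0, 11.0, 12.0, 13.0);

    /* Interleaving happens per 128-bit lane. */
    __m256d lo = _mm256_unpacklo_pd(a, b);  /* {0, 10, 2, 12} */
    __m256d hi = _mm256_unpackhi_pd(a, b);  /* {1, 11, 3, 13} */

    double out[4];
    _mm256_storeu_pd(out, lo);
    printf("lo: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
    _mm256_storeu_pd(out, hi);
    printf("hi: %g %g %g %g\n", out[0], out[1], out[2], out[3]);
    return 0;
}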
-/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns the ZF flag in the EFLAGS register. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testz_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_vtestzpd((__v2df)__a, (__v2df)__b); -} - -/// Given two 128-bit floating-point vectors of [2 x double], perform an -/// element-by-element comparison of the double-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the CF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns the CF flag in the EFLAGS register. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testc_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_vtestcpd((__v2df)__a, (__v2df)__b); -} - -/// Given two 128-bit floating-point vectors of [2 x double], perform an -/// element-by-element comparison of the double-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, -/// otherwise it returns 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testnzc_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_vtestnzcpd((__v2df)__a, (__v2df)__b); -} - -/// Given two 128-bit floating-point vectors of [4 x float], perform an -/// element-by-element comparison of the single-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the ZF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. 
-/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns the ZF flag. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testz_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_vtestzps((__v4sf)__a, (__v4sf)__b); -} - -/// Given two 128-bit floating-point vectors of [4 x float], perform an -/// element-by-element comparison of the single-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the CF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns the CF flag. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testc_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_vtestcps((__v4sf)__a, (__v4sf)__b); -} - -/// Given two 128-bit floating-point vectors of [4 x float], perform an -/// element-by-element comparison of the single-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, -/// otherwise it returns 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS128 -_mm_testnzc_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_vtestnzcps((__v4sf)__a, (__v4sf)__b); -} - -/// Given two 256-bit floating-point vectors of [4 x double], perform an -/// element-by-element comparison of the double-precision elements in the -/// first source vector and the corresponding elements in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the ZF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double]. 
-/// \returns the ZF flag. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testz_pd(__m256d __a, __m256d __b) -{ - return __builtin_ia32_vtestzpd256((__v4df)__a, (__v4df)__b); -} - -/// Given two 256-bit floating-point vectors of [4 x double], perform an -/// element-by-element comparison of the double-precision elements in the -/// first source vector and the corresponding elements in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the CF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double]. -/// \returns the CF flag. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testc_pd(__m256d __a, __m256d __b) -{ - return __builtin_ia32_vtestcpd256((__v4df)__a, (__v4df)__b); -} - -/// Given two 256-bit floating-point vectors of [4 x double], perform an -/// element-by-element comparison of the double-precision elements in the -/// first source vector and the corresponding elements in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of double-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of double-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, -/// otherwise it returns 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double]. -/// \param __b -/// A 256-bit vector of [4 x double]. -/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testnzc_pd(__m256d __a, __m256d __b) -{ - return __builtin_ia32_vtestnzcpd256((__v4df)__a, (__v4df)__b); -} - -/// Given two 256-bit floating-point vectors of [8 x float], perform an -/// element-by-element comparison of the single-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the ZF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float]. -/// \returns the ZF flag. 
-static __inline int __DEFAULT_FN_ATTRS -_mm256_testz_ps(__m256 __a, __m256 __b) -{ - return __builtin_ia32_vtestzps256((__v8sf)__a, (__v8sf)__b); -} - -/// Given two 256-bit floating-point vectors of [8 x float], perform an -/// element-by-element comparison of the single-precision element in the -/// first source vector and the corresponding element in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the CF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float]. -/// \returns the CF flag. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testc_ps(__m256 __a, __m256 __b) -{ - return __builtin_ia32_vtestcps256((__v8sf)__a, (__v8sf)__b); -} - -/// Given two 256-bit floating-point vectors of [8 x float], perform an -/// element-by-element comparison of the single-precision elements in the -/// first source vector and the corresponding elements in the second source -/// vector. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of single-precision elements where the -/// sign-bits of both elements are 1, the ZF flag is set to 0. Otherwise the -/// ZF flag is set to 1. \n -/// If there is at least one pair of single-precision elements where the -/// sign-bit of the first element is 0 and the sign-bit of the second element -/// is 1, the CF flag is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, -/// otherwise it returns 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VTESTPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float]. -/// \param __b -/// A 256-bit vector of [8 x float]. -/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testnzc_ps(__m256 __a, __m256 __b) -{ - return __builtin_ia32_vtestnzcps256((__v8sf)__a, (__v8sf)__b); -} - -/// Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of bits where both bits are 1, the ZF flag -/// is set to 0. Otherwise the ZF flag is set to 1. \n -/// If there is at least one pair of bits where the bit from the first source -/// vector is 0 and the bit from the second source vector is 1, the CF flag -/// is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the ZF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \param __b -/// A 256-bit integer vector. -/// \returns the ZF flag. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testz_si256(__m256i __a, __m256i __b) -{ - return __builtin_ia32_ptestz256((__v4di)__a, (__v4di)__b); -} - -/// Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors. 
-/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of bits where both bits are 1, the ZF flag -/// is set to 0. Otherwise the ZF flag is set to 1. \n -/// If there is at least one pair of bits where the bit from the first source -/// vector is 0 and the bit from the second source vector is 1, the CF flag -/// is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns the value of the CF flag. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \param __b -/// A 256-bit integer vector. -/// \returns the CF flag. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testc_si256(__m256i __a, __m256i __b) -{ - return __builtin_ia32_ptestc256((__v4di)__a, (__v4di)__b); -} - -/// Given two 256-bit integer vectors, perform a bit-by-bit comparison -/// of the two source vectors. -/// -/// The EFLAGS register is updated as follows: \n -/// If there is at least one pair of bits where both bits are 1, the ZF flag -/// is set to 0. Otherwise the ZF flag is set to 1. \n -/// If there is at least one pair of bits where the bit from the first source -/// vector is 0 and the bit from the second source vector is 1, the CF flag -/// is set to 0. Otherwise the CF flag is set to 1. \n -/// This intrinsic returns 1 if both the ZF and CF flags are set to 0, -/// otherwise it returns 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \param __b -/// A 256-bit integer vector. -/// \returns 1 if both the ZF and CF flags are set to 0, otherwise returns 0. -static __inline int __DEFAULT_FN_ATTRS -_mm256_testnzc_si256(__m256i __a, __m256i __b) -{ - return __builtin_ia32_ptestnzc256((__v4di)__a, (__v4di)__b); -} - -/* Vector extract sign mask */ -/// Extracts the sign bits of double-precision floating point elements -/// in a 256-bit vector of [4 x double] and writes them to the lower order -/// bits of the return value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVMSKPD instruction. -/// -/// \param __a -/// A 256-bit vector of [4 x double] containing the double-precision -/// floating point values with sign bits to be extracted. -/// \returns The sign bits from the operand, written to bits [3:0]. -static __inline int __DEFAULT_FN_ATTRS -_mm256_movemask_pd(__m256d __a) -{ - return __builtin_ia32_movmskpd256((__v4df)__a); -} - -/// Extracts the sign bits of single-precision floating point elements -/// in a 256-bit vector of [8 x float] and writes them to the lower order -/// bits of the return value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVMSKPS instruction. -/// -/// \param __a -/// A 256-bit vector of [8 x float] containing the single-precision floating -/// point values with sign bits to be extracted. -/// \returns The sign bits from the operand, written to bits [7:0]. -static __inline int __DEFAULT_FN_ATTRS -_mm256_movemask_ps(__m256 __a) -{ - return __builtin_ia32_movmskps256((__v8sf)__a); -} - -/* Vector __zero */ -/// Zeroes the contents of all XMM or YMM registers. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VZEROALL instruction. -static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) -_mm256_zeroall(void) -{ - __builtin_ia32_vzeroall(); -} - -/// Zeroes the upper 128 bits (bits 255:128) of all YMM registers. 
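For context, a minimal usage sketch of the flag-test and sign-mask intrinsics documented above (illustrative example only, not part of this header; the helper names are the editor's):

    #include <immintrin.h>

    /* 1 when no double in v has its sign bit set (none are negative). */
    static int all_nonnegative(__m256d v)
    {
        /* VMOVMSKPD packs the four sign bits into bits [3:0]. */
        return _mm256_movemask_pd(v) == 0;
    }

    /* 1 when the 256-bit integer vectors a and b have no set bit in common. */
    static int bits_disjoint(__m256i a, __m256i b)
    {
        /* VPTEST: ZF is set, and 1 is returned, when (a & b) == 0. */
        return _mm256_testz_si256(a, b);
    }

The same ZF/CF convention carries over to the VTESTPS/VTESTPD forms, except that only the per-element sign bits participate in the comparison.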
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VZEROUPPER instruction. -static __inline void __attribute__((__always_inline__, __nodebug__, __target__("avx"))) -_mm256_zeroupper(void) -{ - __builtin_ia32_vzeroupper(); -} - -/* Vector load with broadcast */ -/// Loads a scalar single-precision floating point value from the -/// specified address pointed to by \a __a and broadcasts it to the elements -/// of a [4 x float] vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTSS instruction. -/// -/// \param __a -/// The single-precision floating point value to be broadcast. -/// \returns A 128-bit vector of [4 x float] whose 32-bit elements are set -/// equal to the broadcast value. -static __inline __m128 __DEFAULT_FN_ATTRS128 -_mm_broadcast_ss(float const *__a) -{ - float __f = *__a; - return __extension__ (__m128)(__v4sf){ __f, __f, __f, __f }; -} - -/// Loads a scalar double-precision floating point value from the -/// specified address pointed to by \a __a and broadcasts it to the elements -/// of a [4 x double] vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTSD instruction. -/// -/// \param __a -/// The double-precision floating point value to be broadcast. -/// \returns A 256-bit vector of [4 x double] whose 64-bit elements are set -/// equal to the broadcast value. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_broadcast_sd(double const *__a) -{ - double __d = *__a; - return __extension__ (__m256d)(__v4df){ __d, __d, __d, __d }; -} - -/// Loads a scalar single-precision floating point value from the -/// specified address pointed to by \a __a and broadcasts it to the elements -/// of a [8 x float] vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTSS instruction. -/// -/// \param __a -/// The single-precision floating point value to be broadcast. -/// \returns A 256-bit vector of [8 x float] whose 32-bit elements are set -/// equal to the broadcast value. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_broadcast_ss(float const *__a) -{ - float __f = *__a; - return __extension__ (__m256)(__v8sf){ __f, __f, __f, __f, __f, __f, __f, __f }; -} - -/// Loads the data from a 128-bit vector of [2 x double] from the -/// specified address pointed to by \a __a and broadcasts it to 128-bit -/// elements in a 256-bit vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTF128 instruction. -/// -/// \param __a -/// The 128-bit vector of [2 x double] to be broadcast. -/// \returns A 256-bit vector of [4 x double] whose 128-bit elements are set -/// equal to the broadcast value. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_broadcast_pd(__m128d const *__a) -{ - __m128d __b = _mm_loadu_pd((const double *)__a); - return (__m256d)__builtin_shufflevector((__v2df)__b, (__v2df)__b, - 0, 1, 0, 1); -} - -/// Loads the data from a 128-bit vector of [4 x float] from the -/// specified address pointed to by \a __a and broadcasts it to 128-bit -/// elements in a 256-bit vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTF128 instruction. -/// -/// \param __a -/// The 128-bit vector of [4 x float] to be broadcast. -/// \returns A 256-bit vector of [8 x float] whose 128-bit elements are set -/// equal to the broadcast value. 
-static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_broadcast_ps(__m128 const *__a) -{ - __m128 __b = _mm_loadu_ps((const float *)__a); - return (__m256)__builtin_shufflevector((__v4sf)__b, (__v4sf)__b, - 0, 1, 2, 3, 0, 1, 2, 3); -} - -/* SIMD load ops */ -/// Loads 4 double-precision floating point values from a 32-byte aligned -/// memory location pointed to by \a __p into a vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPD instruction. -/// -/// \param __p -/// A 32-byte aligned pointer to a memory location containing -/// double-precision floating point values. -/// \returns A 256-bit vector of [4 x double] containing the moved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_load_pd(double const *__p) -{ - return *(const __m256d *)__p; -} - -/// Loads 8 single-precision floating point values from a 32-byte aligned -/// memory location pointed to by \a __p into a vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS instruction. -/// -/// \param __p -/// A 32-byte aligned pointer to a memory location containing float values. -/// \returns A 256-bit vector of [8 x float] containing the moved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_load_ps(float const *__p) -{ - return *(const __m256 *)__p; -} - -/// Loads 4 double-precision floating point values from an unaligned -/// memory location pointed to by \a __p into a vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPD instruction. -/// -/// \param __p -/// A pointer to a memory location containing double-precision floating -/// point values. -/// \returns A 256-bit vector of [4 x double] containing the moved values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_loadu_pd(double const *__p) -{ - struct __loadu_pd { - __m256d_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_pd*)__p)->__v; -} - -/// Loads 8 single-precision floating point values from an unaligned -/// memory location pointed to by \a __p into a vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPS instruction. -/// -/// \param __p -/// A pointer to a memory location containing single-precision floating -/// point values. -/// \returns A 256-bit vector of [8 x float] containing the moved values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_loadu_ps(float const *__p) -{ - struct __loadu_ps { - __m256_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ps*)__p)->__v; -} - -/// Loads 256 bits of integer data from a 32-byte aligned memory -/// location pointed to by \a __p into elements of a 256-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQA instruction. -/// -/// \param __p -/// A 32-byte aligned pointer to a 256-bit integer vector containing integer -/// values. -/// \returns A 256-bit integer vector containing the moved values. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_load_si256(__m256i const *__p) -{ - return *__p; -} - -/// Loads 256 bits of integer data from an unaligned memory location -/// pointed to by \a __p into a 256-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQU instruction. -/// -/// \param __p -/// A pointer to a 256-bit integer vector containing integer values. -/// \returns A 256-bit integer vector containing the moved values. 
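The aligned and unaligned load forms documented here differ only in the contract on __p; a short sketch of how that plays out in practice (the function and parameter names are illustrative):

    #include <immintrin.h>

    /* Adds the four doubles at u (any alignment) to the four doubles at a,
     * which must be 32-byte aligned, and stores the result back to a. */
    static void add4(double *a, const double *u)
    {
        __m256d va = _mm256_load_pd(a);   /* VMOVAPD: faults on misalignment */
        __m256d vu = _mm256_loadu_pd(u);  /* VMOVUPD: any alignment */
        _mm256_store_pd(a, _mm256_add_pd(va, vu));
    }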
-static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu_si256(__m256i_u const *__p) -{ - struct __loadu_si256 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_si256*)__p)->__v; -} - -/// Loads 256 bits of integer data from an unaligned memory location -/// pointed to by \a __p into a 256-bit integer vector. This intrinsic may -/// perform better than \c _mm256_loadu_si256 when the data crosses a cache -/// line boundary. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VLDDQU instruction. -/// -/// \param __p -/// A pointer to a 256-bit integer vector containing integer values. -/// \returns A 256-bit integer vector containing the moved values. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_lddqu_si256(__m256i const *__p) -{ - return (__m256i)__builtin_ia32_lddqu256((char const *)__p); -} - -/* SIMD store ops */ -/// Stores double-precision floating point values from a 256-bit vector -/// of [4 x double] to a 32-byte aligned memory location pointed to by -/// \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPD instruction. -/// -/// \param __p -/// A 32-byte aligned pointer to a memory location that will receive the -/// double-precision floaing point values. -/// \param __a -/// A 256-bit vector of [4 x double] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_store_pd(double *__p, __m256d __a) -{ - *(__m256d *)__p = __a; -} - -/// Stores single-precision floating point values from a 256-bit vector -/// of [8 x float] to a 32-byte aligned memory location pointed to by \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS instruction. -/// -/// \param __p -/// A 32-byte aligned pointer to a memory location that will receive the -/// float values. -/// \param __a -/// A 256-bit vector of [8 x float] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_store_ps(float *__p, __m256 __a) -{ - *(__m256 *)__p = __a; -} - -/// Stores double-precision floating point values from a 256-bit vector -/// of [4 x double] to an unaligned memory location pointed to by \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPD instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the double-precision -/// floating point values. -/// \param __a -/// A 256-bit vector of [4 x double] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu_pd(double *__p, __m256d __a) -{ - struct __storeu_pd { - __m256d_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_pd*)__p)->__v = __a; -} - -/// Stores single-precision floating point values from a 256-bit vector -/// of [8 x float] to an unaligned memory location pointed to by \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPS instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __a -/// A 256-bit vector of [8 x float] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu_ps(float *__p, __m256 __a) -{ - struct __storeu_ps { - __m256_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ps*)__p)->__v = __a; -} - -/// Stores integer values from a 256-bit integer vector to a 32-byte -/// aligned memory location pointed to by \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQA instruction. 
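Combining the broadcast and unaligned load/store intrinsics above, a typical scale-in-place step might look as follows (illustrative sketch; the names are the editor's):

    #include <immintrin.h>

    /* Multiplies the eight floats at p (any alignment) by *scale in place. */
    static void scale8(float *p, const float *scale)
    {
        __m256 s = _mm256_broadcast_ss(scale);   /* VBROADCASTSS from memory */
        __m256 v = _mm256_loadu_ps(p);
        _mm256_storeu_ps(p, _mm256_mul_ps(v, s));
    }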
-/// -/// \param __p -/// A 32-byte aligned pointer to a memory location that will receive the -/// integer values. -/// \param __a -/// A 256-bit integer vector containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_store_si256(__m256i *__p, __m256i __a) -{ - *__p = __a; -} - -/// Stores integer values from a 256-bit integer vector to an unaligned -/// memory location pointed to by \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQU instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the integer values. -/// \param __a -/// A 256-bit integer vector containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu_si256(__m256i_u *__p, __m256i __a) -{ - struct __storeu_si256 { - __m256i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si256*)__p)->__v = __a; -} - -/* Conditional load ops */ -/// Conditionally loads double-precision floating point elements from a -/// memory location pointed to by \a __p into a 128-bit vector of -/// [2 x double], depending on the mask bits associated with each data -/// element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPD instruction. -/// -/// \param __p -/// A pointer to a memory location that contains the double-precision -/// floating point values. -/// \param __m -/// A 128-bit integer vector containing the mask. The most significant bit of -/// each data element represents the mask bits. If a mask bit is zero, the -/// corresponding value in the memory location is not loaded and the -/// corresponding field in the return value is set to zero. -/// \returns A 128-bit vector of [2 x double] containing the loaded values. -static __inline __m128d __DEFAULT_FN_ATTRS128 -_mm_maskload_pd(double const *__p, __m128i __m) -{ - return (__m128d)__builtin_ia32_maskloadpd((const __v2df *)__p, (__v2di)__m); -} - -/// Conditionally loads double-precision floating point elements from a -/// memory location pointed to by \a __p into a 256-bit vector of -/// [4 x double], depending on the mask bits associated with each data -/// element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPD instruction. -/// -/// \param __p -/// A pointer to a memory location that contains the double-precision -/// floating point values. -/// \param __m -/// A 256-bit integer vector of [4 x quadword] containing the mask. The most -/// significant bit of each quadword element represents the mask bits. If a -/// mask bit is zero, the corresponding value in the memory location is not -/// loaded and the corresponding field in the return value is set to zero. -/// \returns A 256-bit vector of [4 x double] containing the loaded values. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_maskload_pd(double const *__p, __m256i __m) -{ - return (__m256d)__builtin_ia32_maskloadpd256((const __v4df *)__p, - (__v4di)__m); -} - -/// Conditionally loads single-precision floating point elements from a -/// memory location pointed to by \a __p into a 128-bit vector of -/// [4 x float], depending on the mask bits associated with each data -/// element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPS instruction. -/// -/// \param __p -/// A pointer to a memory location that contains the single-precision -/// floating point values. -/// \param __m -/// A 128-bit integer vector containing the mask. The most significant bit of -/// each data element represents the mask bits. 
If a mask bit is zero, the -/// corresponding value in the memory location is not loaded and the -/// corresponding field in the return value is set to zero. -/// \returns A 128-bit vector of [4 x float] containing the loaded values. -static __inline __m128 __DEFAULT_FN_ATTRS128 -_mm_maskload_ps(float const *__p, __m128i __m) -{ - return (__m128)__builtin_ia32_maskloadps((const __v4sf *)__p, (__v4si)__m); -} - -/// Conditionally loads single-precision floating point elements from a -/// memory location pointed to by \a __p into a 256-bit vector of -/// [8 x float], depending on the mask bits associated with each data -/// element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPS instruction. -/// -/// \param __p -/// A pointer to a memory location that contains the single-precision -/// floating point values. -/// \param __m -/// A 256-bit integer vector of [8 x dword] containing the mask. The most -/// significant bit of each dword element represents the mask bits. If a mask -/// bit is zero, the corresponding value in the memory location is not loaded -/// and the corresponding field in the return value is set to zero. -/// \returns A 256-bit vector of [8 x float] containing the loaded values. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_maskload_ps(float const *__p, __m256i __m) -{ - return (__m256)__builtin_ia32_maskloadps256((const __v8sf *)__p, (__v8si)__m); -} - -/* Conditional store ops */ -/// Moves single-precision floating point values from a 256-bit vector -/// of [8 x float] to a memory location pointed to by \a __p, according to -/// the specified mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPS instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __m -/// A 256-bit integer vector of [8 x dword] containing the mask. The most -/// significant bit of each dword element in the mask vector represents the -/// mask bits. If a mask bit is zero, the corresponding value from vector -/// \a __a is not stored and the corresponding field in the memory location -/// pointed to by \a __p is not changed. -/// \param __a -/// A 256-bit vector of [8 x float] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS -_mm256_maskstore_ps(float *__p, __m256i __m, __m256 __a) -{ - __builtin_ia32_maskstoreps256((__v8sf *)__p, (__v8si)__m, (__v8sf)__a); -} - -/// Moves double-precision values from a 128-bit vector of [2 x double] -/// to a memory location pointed to by \a __p, according to the specified -/// mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPD instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __m -/// A 128-bit integer vector containing the mask. The most significant bit of -/// each field in the mask vector represents the mask bits. If a mask bit is -/// zero, the corresponding value from vector \a __a is not stored and the -/// corresponding field in the memory location pointed to by \a __p is not -/// changed. -/// \param __a -/// A 128-bit vector of [2 x double] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS128 -_mm_maskstore_pd(double *__p, __m128i __m, __m128d __a) -{ - __builtin_ia32_maskstorepd((__v2df *)__p, (__v2di)__m, (__v2df)__a); -} - -/// Moves double-precision values from a 256-bit vector of [4 x double] -/// to a memory location pointed to by \a __p, according to the specified -/// mask. 
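A common use of the masked loads above is handling a partial tail of an array without reading past its end; a minimal sketch (the helper below is the editor's invention):

    #include <immintrin.h>

    /* Loads the first n (0..4) doubles from p; the remaining elements of the
     * result are zero and the corresponding memory is not accessed. */
    static __m256d load_first_n(const double *p, int n)
    {
        /* Only elements whose quadword sign bit is set in the mask are loaded. */
        __m256i mask = _mm256_set_epi64x(n > 3 ? -1LL : 0, n > 2 ? -1LL : 0,
                                         n > 1 ? -1LL : 0, n > 0 ? -1LL : 0);
        return _mm256_maskload_pd(p, mask);
    }

The maskstore intrinsics that follow use the same mask convention in the store direction.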
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPD instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __m -/// A 256-bit integer vector of [4 x quadword] containing the mask. The most -/// significant bit of each quadword element in the mask vector represents -/// the mask bits. If a mask bit is zero, the corresponding value from vector -/// __a is not stored and the corresponding field in the memory location -/// pointed to by \a __p is not changed. -/// \param __a -/// A 256-bit vector of [4 x double] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS -_mm256_maskstore_pd(double *__p, __m256i __m, __m256d __a) -{ - __builtin_ia32_maskstorepd256((__v4df *)__p, (__v4di)__m, (__v4df)__a); -} - -/// Moves single-precision floating point values from a 128-bit vector -/// of [4 x float] to a memory location pointed to by \a __p, according to -/// the specified mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVPS instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __m -/// A 128-bit integer vector containing the mask. The most significant bit of -/// each field in the mask vector represents the mask bits. If a mask bit is -/// zero, the corresponding value from vector __a is not stored and the -/// corresponding field in the memory location pointed to by \a __p is not -/// changed. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline void __DEFAULT_FN_ATTRS128 -_mm_maskstore_ps(float *__p, __m128i __m, __m128 __a) -{ - __builtin_ia32_maskstoreps((__v4sf *)__p, (__v4si)__m, (__v4sf)__a); -} - -/* Cacheability support ops */ -/// Moves integer data from a 256-bit integer vector to a 32-byte -/// aligned memory location. To minimize caching, the data is flagged as -/// non-temporal (unlikely to be used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTDQ instruction. -/// -/// \param __a -/// A pointer to a 32-byte aligned memory location that will receive the -/// integer values. -/// \param __b -/// A 256-bit integer vector containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_si256(__m256i *__a, __m256i __b) -{ - typedef __v4di __v4di_aligned __attribute__((aligned(32))); - __builtin_nontemporal_store((__v4di_aligned)__b, (__v4di_aligned*)__a); -} - -/// Moves double-precision values from a 256-bit vector of [4 x double] -/// to a 32-byte aligned memory location. To minimize caching, the data is -/// flagged as non-temporal (unlikely to be used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTPD instruction. -/// -/// \param __a -/// A pointer to a 32-byte aligned memory location that will receive the -/// double-precision floating-point values. -/// \param __b -/// A 256-bit vector of [4 x double] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_pd(double *__a, __m256d __b) -{ - typedef __v4df __v4df_aligned __attribute__((aligned(32))); - __builtin_nontemporal_store((__v4df_aligned)__b, (__v4df_aligned*)__a); -} - -/// Moves single-precision floating point values from a 256-bit vector -/// of [8 x float] to a 32-byte aligned memory location. To minimize -/// caching, the data is flagged as non-temporal (unlikely to be used again -/// soon). 
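The non-temporal stores above are typically paired with a store fence once the streamed region is complete; an illustrative sketch (buffer and function names are the editor's):

    #include <immintrin.h>
    #include <stddef.h>

    /* Fills a 32-byte-aligned buffer of n floats (n a multiple of 8) with
     * value, bypassing the cache with non-temporal stores. */
    static void fill_stream(float *dst, size_t n, float value)
    {
        __m256 v = _mm256_set1_ps(value);
        for (size_t i = 0; i < n; i += 8)
            _mm256_stream_ps(dst + i, v);
        _mm_sfence();   /* order the streamed stores before later stores */
    }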
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTPS instruction. -/// -/// \param __p -/// A pointer to a 32-byte aligned memory location that will receive the -/// single-precision floating point values. -/// \param __a -/// A 256-bit vector of [8 x float] containing the values to be moved. -static __inline void __DEFAULT_FN_ATTRS -_mm256_stream_ps(float *__p, __m256 __a) -{ - typedef __v8sf __v8sf_aligned __attribute__((aligned(32))); - __builtin_nontemporal_store((__v8sf_aligned)__a, (__v8sf_aligned*)__p); -} - -/* Create vectors */ -/// Create a 256-bit vector of [4 x double] with undefined values. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \returns A 256-bit vector of [4 x double] containing undefined values. -static __inline__ __m256d __DEFAULT_FN_ATTRS -_mm256_undefined_pd(void) -{ - return (__m256d)__builtin_ia32_undef256(); -} - -/// Create a 256-bit vector of [8 x float] with undefined values. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \returns A 256-bit vector of [8 x float] containing undefined values. -static __inline__ __m256 __DEFAULT_FN_ATTRS -_mm256_undefined_ps(void) -{ - return (__m256)__builtin_ia32_undef256(); -} - -/// Create a 256-bit integer vector with undefined values. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \returns A 256-bit integer vector containing undefined values. -static __inline__ __m256i __DEFAULT_FN_ATTRS -_mm256_undefined_si256(void) -{ - return (__m256i)__builtin_ia32_undef256(); -} - -/// Constructs a 256-bit floating-point vector of [4 x double] -/// initialized with the specified double-precision floating-point values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD+VINSERTF128 -/// instruction. -/// -/// \param __a -/// A double-precision floating-point value used to initialize bits [255:192] -/// of the result. -/// \param __b -/// A double-precision floating-point value used to initialize bits [191:128] -/// of the result. -/// \param __c -/// A double-precision floating-point value used to initialize bits [127:64] -/// of the result. -/// \param __d -/// A double-precision floating-point value used to initialize bits [63:0] -/// of the result. -/// \returns An initialized 256-bit floating-point vector of [4 x double]. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_set_pd(double __a, double __b, double __c, double __d) -{ - return __extension__ (__m256d){ __d, __c, __b, __a }; -} - -/// Constructs a 256-bit floating-point vector of [8 x float] initialized -/// with the specified single-precision floating-point values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __a -/// A single-precision floating-point value used to initialize bits [255:224] -/// of the result. -/// \param __b -/// A single-precision floating-point value used to initialize bits [223:192] -/// of the result. -/// \param __c -/// A single-precision floating-point value used to initialize bits [191:160] -/// of the result. -/// \param __d -/// A single-precision floating-point value used to initialize bits [159:128] -/// of the result. -/// \param __e -/// A single-precision floating-point value used to initialize bits [127:96] -/// of the result. -/// \param __f -/// A single-precision floating-point value used to initialize bits [95:64] -/// of the result. 
-/// \param __g -/// A single-precision floating-point value used to initialize bits [63:32] -/// of the result. -/// \param __h -/// A single-precision floating-point value used to initialize bits [31:0] -/// of the result. -/// \returns An initialized 256-bit floating-point vector of [8 x float]. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_set_ps(float __a, float __b, float __c, float __d, - float __e, float __f, float __g, float __h) -{ - return __extension__ (__m256){ __h, __g, __f, __e, __d, __c, __b, __a }; -} - -/// Constructs a 256-bit integer vector initialized with the specified -/// 32-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i0 -/// A 32-bit integral value used to initialize bits [255:224] of the result. -/// \param __i1 -/// A 32-bit integral value used to initialize bits [223:192] of the result. -/// \param __i2 -/// A 32-bit integral value used to initialize bits [191:160] of the result. -/// \param __i3 -/// A 32-bit integral value used to initialize bits [159:128] of the result. -/// \param __i4 -/// A 32-bit integral value used to initialize bits [127:96] of the result. -/// \param __i5 -/// A 32-bit integral value used to initialize bits [95:64] of the result. -/// \param __i6 -/// A 32-bit integral value used to initialize bits [63:32] of the result. -/// \param __i7 -/// A 32-bit integral value used to initialize bits [31:0] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_epi32(int __i0, int __i1, int __i2, int __i3, - int __i4, int __i5, int __i6, int __i7) -{ - return __extension__ (__m256i)(__v8si){ __i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0 }; -} - -/// Constructs a 256-bit integer vector initialized with the specified -/// 16-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w15 -/// A 16-bit integral value used to initialize bits [255:240] of the result. -/// \param __w14 -/// A 16-bit integral value used to initialize bits [239:224] of the result. -/// \param __w13 -/// A 16-bit integral value used to initialize bits [223:208] of the result. -/// \param __w12 -/// A 16-bit integral value used to initialize bits [207:192] of the result. -/// \param __w11 -/// A 16-bit integral value used to initialize bits [191:176] of the result. -/// \param __w10 -/// A 16-bit integral value used to initialize bits [175:160] of the result. -/// \param __w09 -/// A 16-bit integral value used to initialize bits [159:144] of the result. -/// \param __w08 -/// A 16-bit integral value used to initialize bits [143:128] of the result. -/// \param __w07 -/// A 16-bit integral value used to initialize bits [127:112] of the result. -/// \param __w06 -/// A 16-bit integral value used to initialize bits [111:96] of the result. -/// \param __w05 -/// A 16-bit integral value used to initialize bits [95:80] of the result. -/// \param __w04 -/// A 16-bit integral value used to initialize bits [79:64] of the result. -/// \param __w03 -/// A 16-bit integral value used to initialize bits [63:48] of the result. -/// \param __w02 -/// A 16-bit integral value used to initialize bits [47:32] of the result. -/// \param __w01 -/// A 16-bit integral value used to initialize bits [31:16] of the result. 
-/// \param __w00 -/// A 16-bit integral value used to initialize bits [15:0] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_epi16(short __w15, short __w14, short __w13, short __w12, - short __w11, short __w10, short __w09, short __w08, - short __w07, short __w06, short __w05, short __w04, - short __w03, short __w02, short __w01, short __w00) -{ - return __extension__ (__m256i)(__v16hi){ __w00, __w01, __w02, __w03, __w04, __w05, __w06, - __w07, __w08, __w09, __w10, __w11, __w12, __w13, __w14, __w15 }; -} - -/// Constructs a 256-bit integer vector initialized with the specified -/// 8-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b31 -/// An 8-bit integral value used to initialize bits [255:248] of the result. -/// \param __b30 -/// An 8-bit integral value used to initialize bits [247:240] of the result. -/// \param __b29 -/// An 8-bit integral value used to initialize bits [239:232] of the result. -/// \param __b28 -/// An 8-bit integral value used to initialize bits [231:224] of the result. -/// \param __b27 -/// An 8-bit integral value used to initialize bits [223:216] of the result. -/// \param __b26 -/// An 8-bit integral value used to initialize bits [215:208] of the result. -/// \param __b25 -/// An 8-bit integral value used to initialize bits [207:200] of the result. -/// \param __b24 -/// An 8-bit integral value used to initialize bits [199:192] of the result. -/// \param __b23 -/// An 8-bit integral value used to initialize bits [191:184] of the result. -/// \param __b22 -/// An 8-bit integral value used to initialize bits [183:176] of the result. -/// \param __b21 -/// An 8-bit integral value used to initialize bits [175:168] of the result. -/// \param __b20 -/// An 8-bit integral value used to initialize bits [167:160] of the result. -/// \param __b19 -/// An 8-bit integral value used to initialize bits [159:152] of the result. -/// \param __b18 -/// An 8-bit integral value used to initialize bits [151:144] of the result. -/// \param __b17 -/// An 8-bit integral value used to initialize bits [143:136] of the result. -/// \param __b16 -/// An 8-bit integral value used to initialize bits [135:128] of the result. -/// \param __b15 -/// An 8-bit integral value used to initialize bits [127:120] of the result. -/// \param __b14 -/// An 8-bit integral value used to initialize bits [119:112] of the result. -/// \param __b13 -/// An 8-bit integral value used to initialize bits [111:104] of the result. -/// \param __b12 -/// An 8-bit integral value used to initialize bits [103:96] of the result. -/// \param __b11 -/// An 8-bit integral value used to initialize bits [95:88] of the result. -/// \param __b10 -/// An 8-bit integral value used to initialize bits [87:80] of the result. -/// \param __b09 -/// An 8-bit integral value used to initialize bits [79:72] of the result. -/// \param __b08 -/// An 8-bit integral value used to initialize bits [71:64] of the result. -/// \param __b07 -/// An 8-bit integral value used to initialize bits [63:56] of the result. -/// \param __b06 -/// An 8-bit integral value used to initialize bits [55:48] of the result. -/// \param __b05 -/// An 8-bit integral value used to initialize bits [47:40] of the result. -/// \param __b04 -/// An 8-bit integral value used to initialize bits [39:32] of the result. 
-/// \param __b03 -/// An 8-bit integral value used to initialize bits [31:24] of the result. -/// \param __b02 -/// An 8-bit integral value used to initialize bits [23:16] of the result. -/// \param __b01 -/// An 8-bit integral value used to initialize bits [15:8] of the result. -/// \param __b00 -/// An 8-bit integral value used to initialize bits [7:0] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_epi8(char __b31, char __b30, char __b29, char __b28, - char __b27, char __b26, char __b25, char __b24, - char __b23, char __b22, char __b21, char __b20, - char __b19, char __b18, char __b17, char __b16, - char __b15, char __b14, char __b13, char __b12, - char __b11, char __b10, char __b09, char __b08, - char __b07, char __b06, char __b05, char __b04, - char __b03, char __b02, char __b01, char __b00) -{ - return __extension__ (__m256i)(__v32qi){ - __b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, - __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, - __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, - __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31 - }; -} - -/// Constructs a 256-bit integer vector initialized with the specified -/// 64-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLQDQ+VINSERTF128 -/// instruction. -/// -/// \param __a -/// A 64-bit integral value used to initialize bits [255:192] of the result. -/// \param __b -/// A 64-bit integral value used to initialize bits [191:128] of the result. -/// \param __c -/// A 64-bit integral value used to initialize bits [127:64] of the result. -/// \param __d -/// A 64-bit integral value used to initialize bits [63:0] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_epi64x(long long __a, long long __b, long long __c, long long __d) -{ - return __extension__ (__m256i)(__v4di){ __d, __c, __b, __a }; -} - -/* Create vectors with elements in reverse order */ -/// Constructs a 256-bit floating-point vector of [4 x double], -/// initialized in reverse order with the specified double-precision -/// floating-point values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD+VINSERTF128 -/// instruction. -/// -/// \param __a -/// A double-precision floating-point value used to initialize bits [63:0] -/// of the result. -/// \param __b -/// A double-precision floating-point value used to initialize bits [127:64] -/// of the result. -/// \param __c -/// A double-precision floating-point value used to initialize bits [191:128] -/// of the result. -/// \param __d -/// A double-precision floating-point value used to initialize bits [255:192] -/// of the result. -/// \returns An initialized 256-bit floating-point vector of [4 x double]. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_setr_pd(double __a, double __b, double __c, double __d) -{ - return _mm256_set_pd(__d, __c, __b, __a); -} - -/// Constructs a 256-bit floating-point vector of [8 x float], -/// initialized in reverse order with the specified single-precision -/// float-point values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __a -/// A single-precision floating-point value used to initialize bits [31:0] -/// of the result. -/// \param __b -/// A single-precision floating-point value used to initialize bits [63:32] -/// of the result. 
-/// \param __c -/// A single-precision floating-point value used to initialize bits [95:64] -/// of the result. -/// \param __d -/// A single-precision floating-point value used to initialize bits [127:96] -/// of the result. -/// \param __e -/// A single-precision floating-point value used to initialize bits [159:128] -/// of the result. -/// \param __f -/// A single-precision floating-point value used to initialize bits [191:160] -/// of the result. -/// \param __g -/// A single-precision floating-point value used to initialize bits [223:192] -/// of the result. -/// \param __h -/// A single-precision floating-point value used to initialize bits [255:224] -/// of the result. -/// \returns An initialized 256-bit floating-point vector of [8 x float]. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_setr_ps(float __a, float __b, float __c, float __d, - float __e, float __f, float __g, float __h) -{ - return _mm256_set_ps(__h, __g, __f, __e, __d, __c, __b, __a); -} - -/// Constructs a 256-bit integer vector, initialized in reverse order -/// with the specified 32-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i0 -/// A 32-bit integral value used to initialize bits [31:0] of the result. -/// \param __i1 -/// A 32-bit integral value used to initialize bits [63:32] of the result. -/// \param __i2 -/// A 32-bit integral value used to initialize bits [95:64] of the result. -/// \param __i3 -/// A 32-bit integral value used to initialize bits [127:96] of the result. -/// \param __i4 -/// A 32-bit integral value used to initialize bits [159:128] of the result. -/// \param __i5 -/// A 32-bit integral value used to initialize bits [191:160] of the result. -/// \param __i6 -/// A 32-bit integral value used to initialize bits [223:192] of the result. -/// \param __i7 -/// A 32-bit integral value used to initialize bits [255:224] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_epi32(int __i0, int __i1, int __i2, int __i3, - int __i4, int __i5, int __i6, int __i7) -{ - return _mm256_set_epi32(__i7, __i6, __i5, __i4, __i3, __i2, __i1, __i0); -} - -/// Constructs a 256-bit integer vector, initialized in reverse order -/// with the specified 16-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w15 -/// A 16-bit integral value used to initialize bits [15:0] of the result. -/// \param __w14 -/// A 16-bit integral value used to initialize bits [31:16] of the result. -/// \param __w13 -/// A 16-bit integral value used to initialize bits [47:32] of the result. -/// \param __w12 -/// A 16-bit integral value used to initialize bits [63:48] of the result. -/// \param __w11 -/// A 16-bit integral value used to initialize bits [79:64] of the result. -/// \param __w10 -/// A 16-bit integral value used to initialize bits [95:80] of the result. -/// \param __w09 -/// A 16-bit integral value used to initialize bits [111:96] of the result. -/// \param __w08 -/// A 16-bit integral value used to initialize bits [127:112] of the result. -/// \param __w07 -/// A 16-bit integral value used to initialize bits [143:128] of the result. -/// \param __w06 -/// A 16-bit integral value used to initialize bits [159:144] of the result. 
-/// \param __w05 -/// A 16-bit integral value used to initialize bits [175:160] of the result. -/// \param __w04 -/// A 16-bit integral value used to initialize bits [191:176] of the result. -/// \param __w03 -/// A 16-bit integral value used to initialize bits [207:192] of the result. -/// \param __w02 -/// A 16-bit integral value used to initialize bits [223:208] of the result. -/// \param __w01 -/// A 16-bit integral value used to initialize bits [239:224] of the result. -/// \param __w00 -/// A 16-bit integral value used to initialize bits [255:240] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_epi16(short __w15, short __w14, short __w13, short __w12, - short __w11, short __w10, short __w09, short __w08, - short __w07, short __w06, short __w05, short __w04, - short __w03, short __w02, short __w01, short __w00) -{ - return _mm256_set_epi16(__w00, __w01, __w02, __w03, - __w04, __w05, __w06, __w07, - __w08, __w09, __w10, __w11, - __w12, __w13, __w14, __w15); -} - -/// Constructs a 256-bit integer vector, initialized in reverse order -/// with the specified 8-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b31 -/// An 8-bit integral value used to initialize bits [7:0] of the result. -/// \param __b30 -/// An 8-bit integral value used to initialize bits [15:8] of the result. -/// \param __b29 -/// An 8-bit integral value used to initialize bits [23:16] of the result. -/// \param __b28 -/// An 8-bit integral value used to initialize bits [31:24] of the result. -/// \param __b27 -/// An 8-bit integral value used to initialize bits [39:32] of the result. -/// \param __b26 -/// An 8-bit integral value used to initialize bits [47:40] of the result. -/// \param __b25 -/// An 8-bit integral value used to initialize bits [55:48] of the result. -/// \param __b24 -/// An 8-bit integral value used to initialize bits [63:56] of the result. -/// \param __b23 -/// An 8-bit integral value used to initialize bits [71:64] of the result. -/// \param __b22 -/// An 8-bit integral value used to initialize bits [79:72] of the result. -/// \param __b21 -/// An 8-bit integral value used to initialize bits [87:80] of the result. -/// \param __b20 -/// An 8-bit integral value used to initialize bits [95:88] of the result. -/// \param __b19 -/// An 8-bit integral value used to initialize bits [103:96] of the result. -/// \param __b18 -/// An 8-bit integral value used to initialize bits [111:104] of the result. -/// \param __b17 -/// An 8-bit integral value used to initialize bits [119:112] of the result. -/// \param __b16 -/// An 8-bit integral value used to initialize bits [127:120] of the result. -/// \param __b15 -/// An 8-bit integral value used to initialize bits [135:128] of the result. -/// \param __b14 -/// An 8-bit integral value used to initialize bits [143:136] of the result. -/// \param __b13 -/// An 8-bit integral value used to initialize bits [151:144] of the result. -/// \param __b12 -/// An 8-bit integral value used to initialize bits [159:152] of the result. -/// \param __b11 -/// An 8-bit integral value used to initialize bits [167:160] of the result. -/// \param __b10 -/// An 8-bit integral value used to initialize bits [175:168] of the result. -/// \param __b09 -/// An 8-bit integral value used to initialize bits [183:176] of the result. 
-/// \param __b08 -/// An 8-bit integral value used to initialize bits [191:184] of the result. -/// \param __b07 -/// An 8-bit integral value used to initialize bits [199:192] of the result. -/// \param __b06 -/// An 8-bit integral value used to initialize bits [207:200] of the result. -/// \param __b05 -/// An 8-bit integral value used to initialize bits [215:208] of the result. -/// \param __b04 -/// An 8-bit integral value used to initialize bits [223:216] of the result. -/// \param __b03 -/// An 8-bit integral value used to initialize bits [231:224] of the result. -/// \param __b02 -/// An 8-bit integral value used to initialize bits [239:232] of the result. -/// \param __b01 -/// An 8-bit integral value used to initialize bits [247:240] of the result. -/// \param __b00 -/// An 8-bit integral value used to initialize bits [255:248] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_epi8(char __b31, char __b30, char __b29, char __b28, - char __b27, char __b26, char __b25, char __b24, - char __b23, char __b22, char __b21, char __b20, - char __b19, char __b18, char __b17, char __b16, - char __b15, char __b14, char __b13, char __b12, - char __b11, char __b10, char __b09, char __b08, - char __b07, char __b06, char __b05, char __b04, - char __b03, char __b02, char __b01, char __b00) -{ - return _mm256_set_epi8(__b00, __b01, __b02, __b03, __b04, __b05, __b06, __b07, - __b08, __b09, __b10, __b11, __b12, __b13, __b14, __b15, - __b16, __b17, __b18, __b19, __b20, __b21, __b22, __b23, - __b24, __b25, __b26, __b27, __b28, __b29, __b30, __b31); -} - -/// Constructs a 256-bit integer vector, initialized in reverse order -/// with the specified 64-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLQDQ+VINSERTF128 -/// instruction. -/// -/// \param __a -/// A 64-bit integral value used to initialize bits [63:0] of the result. -/// \param __b -/// A 64-bit integral value used to initialize bits [127:64] of the result. -/// \param __c -/// A 64-bit integral value used to initialize bits [191:128] of the result. -/// \param __d -/// A 64-bit integral value used to initialize bits [255:192] of the result. -/// \returns An initialized 256-bit integer vector. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_epi64x(long long __a, long long __b, long long __c, long long __d) -{ - return _mm256_set_epi64x(__d, __c, __b, __a); -} - -/* Create vectors with repeated elements */ -/// Constructs a 256-bit floating-point vector of [4 x double], with each -/// of the four double-precision floating-point vector elements set to the -/// specified double-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP+VINSERTF128 instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 256-bit floating-point vector of [4 x double]. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_set1_pd(double __w) -{ - return _mm256_set_pd(__w, __w, __w, __w); -} - -/// Constructs a 256-bit floating-point vector of [8 x float], with each -/// of the eight single-precision floating-point vector elements set to the -/// specified single-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS+VINSERTF128 -/// instruction. 
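The only difference between the _mm256_set_* and _mm256_setr_* constructors above is argument order: the set_* forms take the highest element first, the setr_* forms take elements in low-to-high (memory) order. A small sketch (illustrative only):

    #include <immintrin.h>

    /* Both vectors hold {0,1,2,3,4,5,6,7}, with element 0 equal to 0.0f. */
    static void iota_example(void)
    {
        __m256 a = _mm256_set_ps(7, 6, 5, 4, 3, 2, 1, 0);
        __m256 b = _mm256_setr_ps(0, 1, 2, 3, 4, 5, 6, 7);
        (void)a;
        (void)b;
    }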
-/// -/// \param __w -/// A single-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 256-bit floating-point vector of [8 x float]. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_set1_ps(float __w) -{ - return _mm256_set_ps(__w, __w, __w, __w, __w, __w, __w, __w); -} - -/// Constructs a 256-bit integer vector of [8 x i32], with each of the -/// 32-bit integral vector elements set to the specified 32-bit integral -/// value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS+VINSERTF128 -/// instruction. -/// -/// \param __i -/// A 32-bit integral value used to initialize each vector element of the -/// result. -/// \returns An initialized 256-bit integer vector of [8 x i32]. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set1_epi32(int __i) -{ - return _mm256_set_epi32(__i, __i, __i, __i, __i, __i, __i, __i); -} - -/// Constructs a 256-bit integer vector of [16 x i16], with each of the -/// 16-bit integral vector elements set to the specified 16-bit integral -/// value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSHUFB+VINSERTF128 instruction. -/// -/// \param __w -/// A 16-bit integral value used to initialize each vector element of the -/// result. -/// \returns An initialized 256-bit integer vector of [16 x i16]. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set1_epi16(short __w) -{ - return _mm256_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w, - __w, __w, __w, __w, __w, __w, __w, __w); -} - -/// Constructs a 256-bit integer vector of [32 x i8], with each of the -/// 8-bit integral vector elements set to the specified 8-bit integral value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSHUFB+VINSERTF128 instruction. -/// -/// \param __b -/// An 8-bit integral value used to initialize each vector element of the -/// result. -/// \returns An initialized 256-bit integer vector of [32 x i8]. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set1_epi8(char __b) -{ - return _mm256_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, - __b, __b, __b, __b, __b, __b, __b, __b, - __b, __b, __b, __b, __b, __b, __b, __b, - __b, __b, __b, __b, __b, __b, __b, __b); -} - -/// Constructs a 256-bit integer vector of [4 x i64], with each of the -/// 64-bit integral vector elements set to the specified 64-bit integral -/// value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP+VINSERTF128 instruction. -/// -/// \param __q -/// A 64-bit integral value used to initialize each vector element of the -/// result. -/// \returns An initialized 256-bit integer vector of [4 x i64]. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set1_epi64x(long long __q) -{ - return _mm256_set_epi64x(__q, __q, __q, __q); -} - -/* Create __zeroed vectors */ -/// Constructs a 256-bit floating-point vector of [4 x double] with all -/// vector elements initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS instruction. -/// -/// \returns A 256-bit vector of [4 x double] with all elements set to zero. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_setzero_pd(void) -{ - return __extension__ (__m256d){ 0, 0, 0, 0 }; -} - -/// Constructs a 256-bit floating-point vector of [8 x float] with all -/// vector elements initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS instruction. -/// -/// \returns A 256-bit vector of [8 x float] with all elements set to zero. 
-static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_setzero_ps(void) -{ - return __extension__ (__m256){ 0, 0, 0, 0, 0, 0, 0, 0 }; -} - -/// Constructs a 256-bit integer vector initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS instruction. -/// -/// \returns A 256-bit integer vector initialized to zero. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setzero_si256(void) -{ - return __extension__ (__m256i)(__v4di){ 0, 0, 0, 0 }; -} - -/* Cast between vector types */ -/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit -/// floating-point vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. -/// \returns A 256-bit floating-point vector of [8 x float] containing the same -/// bitwise pattern as the parameter. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_castpd_ps(__m256d __a) -{ - return (__m256)__a; -} - -/// Casts a 256-bit floating-point vector of [4 x double] into a 256-bit -/// integer vector. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. -/// \returns A 256-bit integer vector containing the same bitwise pattern as the -/// parameter. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_castpd_si256(__m256d __a) -{ - return (__m256i)__a; -} - -/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit -/// floating-point vector of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [8 x float]. -/// \returns A 256-bit floating-point vector of [4 x double] containing the same -/// bitwise pattern as the parameter. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_castps_pd(__m256 __a) -{ - return (__m256d)__a; -} - -/// Casts a 256-bit floating-point vector of [8 x float] into a 256-bit -/// integer vector. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [8 x float]. -/// \returns A 256-bit integer vector containing the same bitwise pattern as the -/// parameter. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_castps_si256(__m256 __a) -{ - return (__m256i)__a; -} - -/// Casts a 256-bit integer vector into a 256-bit floating-point vector -/// of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \returns A 256-bit floating-point vector of [8 x float] containing the same -/// bitwise pattern as the parameter. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_castsi256_ps(__m256i __a) -{ - return (__m256)__a; -} - -/// Casts a 256-bit integer vector into a 256-bit floating-point vector -/// of [4 x double]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \returns A 256-bit floating-point vector of [4 x double] containing the same -/// bitwise pattern as the parameter. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_castsi256_pd(__m256i __a) -{ - return (__m256d)__a; -} - -/// Returns the lower 128 bits of a 256-bit floating-point vector of -/// [4 x double] as a 128-bit floating-point vector of [2 x double]. 
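The 256-bit cast intrinsics above only reinterpret the vector bits and generate no instructions, which makes them convenient for bit-level manipulation such as sign flipping; a minimal sketch (the helper name is the editor's):

    #include <immintrin.h>

    /* Negates all eight floats by toggling each sign bit with VXORPS. */
    static __m256 negate8(__m256 v)
    {
        __m256 signmask =
            _mm256_castsi256_ps(_mm256_set1_epi32((int)0x80000000u));
        return _mm256_xor_ps(v, signmask);
    }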
-/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. -/// \returns A 128-bit floating-point vector of [2 x double] containing the -/// lower 128 bits of the parameter. -static __inline __m128d __DEFAULT_FN_ATTRS -_mm256_castpd256_pd128(__m256d __a) -{ - return __builtin_shufflevector((__v4df)__a, (__v4df)__a, 0, 1); -} - -/// Returns the lower 128 bits of a 256-bit floating-point vector of -/// [8 x float] as a 128-bit floating-point vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit floating-point vector of [8 x float]. -/// \returns A 128-bit floating-point vector of [4 x float] containing the -/// lower 128 bits of the parameter. -static __inline __m128 __DEFAULT_FN_ATTRS -_mm256_castps256_ps128(__m256 __a) -{ - return __builtin_shufflevector((__v8sf)__a, (__v8sf)__a, 0, 1, 2, 3); -} - -/// Truncates a 256-bit integer vector into a 128-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 256-bit integer vector. -/// \returns A 128-bit integer vector containing the lower 128 bits of the -/// parameter. -static __inline __m128i __DEFAULT_FN_ATTRS -_mm256_castsi256_si128(__m256i __a) -{ - return __builtin_shufflevector((__v4di)__a, (__v4di)__a, 0, 1); -} - -/// Constructs a 256-bit floating-point vector of [4 x double] from a -/// 128-bit floating-point vector of [2 x double]. -/// -/// The lower 128 bits contain the value of the source vector. The contents -/// of the upper 128 bits are undefined. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits -/// contain the value of the parameter. The contents of the upper 128 bits -/// are undefined. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_castpd128_pd256(__m128d __a) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 1, -1, -1); -} - -/// Constructs a 256-bit floating-point vector of [8 x float] from a -/// 128-bit floating-point vector of [4 x float]. -/// -/// The lower 128 bits contain the value of the source vector. The contents -/// of the upper 128 bits are undefined. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits -/// contain the value of the parameter. The contents of the upper 128 bits -/// are undefined. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_castps128_ps256(__m128 __a) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1, 2, 3, -1, -1, -1, -1); -} - -/// Constructs a 256-bit integer vector from a 128-bit integer vector. -/// -/// The lower 128 bits contain the value of the source vector. The contents -/// of the upper 128 bits are undefined. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 256-bit integer vector. The lower 128 bits contain the value of -/// the parameter. The contents of the upper 128 bits are undefined. 
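A minimal sketch of the cast and truncation helpers above; casts only reinterpret bits and emit no instruction, and the helper name is illustrative:

#include <immintrin.h>

/* Reinterpret a [4 x double] vector as [8 x float] and keep its low 128 bits. */
static __m128 low_half_as_ps(__m256d d)
{
    __m256 ps = _mm256_castpd_ps(d);     /* bitwise reinterpretation only */
    return _mm256_castps256_ps128(ps);   /* lower 128 bits of the vector  */
}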
-static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_castsi128_si256(__m128i __a) -{ - return __builtin_shufflevector((__v2di)__a, (__v2di)__a, 0, 1, -1, -1); -} - -/// Constructs a 256-bit floating-point vector of [4 x double] from a -/// 128-bit floating-point vector of [2 x double]. The lower 128 bits -/// contain the value of the source vector. The upper 128 bits are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 256-bit floating-point vector of [4 x double]. The lower 128 bits -/// contain the value of the parameter. The upper 128 bits are set to zero. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_zextpd128_pd256(__m128d __a) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)_mm_setzero_pd(), 0, 1, 2, 3); -} - -/// Constructs a 256-bit floating-point vector of [8 x float] from a -/// 128-bit floating-point vector of [4 x float]. The lower 128 bits contain -/// the value of the source vector. The upper 128 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 256-bit floating-point vector of [8 x float]. The lower 128 bits -/// contain the value of the parameter. The upper 128 bits are set to zero. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_zextps128_ps256(__m128 __a) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)_mm_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7); -} - -/// Constructs a 256-bit integer vector from a 128-bit integer vector. -/// The lower 128 bits contain the value of the source vector. The upper -/// 128 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 256-bit integer vector. The lower 128 bits contain the value of -/// the parameter. The upper 128 bits are set to zero. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_zextsi128_si256(__m128i __a) -{ - return __builtin_shufflevector((__v2di)__a, (__v2di)_mm_setzero_si128(), 0, 1, 2, 3); -} - -/* - Vector insert. - We use macros rather than inlines because we only want to accept - invocations where the immediate M is a constant expression. -*/ -/// Constructs a new 256-bit vector of [8 x float] by first duplicating -/// a 256-bit vector of [8 x float] given in the first parameter, and then -/// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [4 x float] in the second parameter. -/// -/// The immediate integer parameter determines between the upper or the lower -/// 128 bits. -/// -/// \headerfile -/// -/// \code -/// __m256 _mm256_insertf128_ps(__m256 V1, __m128 V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param V1 -/// A 256-bit vector of [8 x float]. This vector is copied to the result -/// first, and then either the upper or the lower 128 bits of the result will -/// be replaced by the contents of \a V2. -/// \param V2 -/// A 128-bit vector of [4 x float]. The contents of this parameter are -/// written to either the upper or the lower 128 bits of the result depending -/// on the value of parameter \a M. -/// \param M -/// An immediate integer. 
The least significant bit determines how the values -/// from the two parameters are interleaved: \n -/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, -/// and bits [255:128] of \a V1 are copied to bits [255:128] of the -/// result. \n -/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the -/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the -/// result. -/// \returns A 256-bit vector of [8 x float] containing the interleaved values. -#define _mm256_insertf128_ps(V1, V2, M) \ - ((__m256)__builtin_ia32_vinsertf128_ps256((__v8sf)(__m256)(V1), \ - (__v4sf)(__m128)(V2), (int)(M))) - -/// Constructs a new 256-bit vector of [4 x double] by first duplicating -/// a 256-bit vector of [4 x double] given in the first parameter, and then -/// replacing either the upper or the lower 128 bits with the contents of a -/// 128-bit vector of [2 x double] in the second parameter. -/// -/// The immediate integer parameter determines between the upper or the lower -/// 128 bits. -/// -/// \headerfile -/// -/// \code -/// __m256d _mm256_insertf128_pd(__m256d V1, __m128d V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param V1 -/// A 256-bit vector of [4 x double]. This vector is copied to the result -/// first, and then either the upper or the lower 128 bits of the result will -/// be replaced by the contents of \a V2. -/// \param V2 -/// A 128-bit vector of [2 x double]. The contents of this parameter are -/// written to either the upper or the lower 128 bits of the result depending -/// on the value of parameter \a M. -/// \param M -/// An immediate integer. The least significant bit determines how the values -/// from the two parameters are interleaved: \n -/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, -/// and bits [255:128] of \a V1 are copied to bits [255:128] of the -/// result. \n -/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the -/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the -/// result. -/// \returns A 256-bit vector of [4 x double] containing the interleaved values. -#define _mm256_insertf128_pd(V1, V2, M) \ - ((__m256d)__builtin_ia32_vinsertf128_pd256((__v4df)(__m256d)(V1), \ - (__v2df)(__m128d)(V2), (int)(M))) - -/// Constructs a new 256-bit integer vector by first duplicating a -/// 256-bit integer vector given in the first parameter, and then replacing -/// either the upper or the lower 128 bits with the contents of a 128-bit -/// integer vector in the second parameter. -/// -/// The immediate integer parameter determines between the upper or the lower -/// 128 bits. -/// -/// \headerfile -/// -/// \code -/// __m256i _mm256_insertf128_si256(__m256i V1, __m128i V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param V1 -/// A 256-bit integer vector. This vector is copied to the result first, and -/// then either the upper or the lower 128 bits of the result will be -/// replaced by the contents of \a V2. -/// \param V2 -/// A 128-bit integer vector. The contents of this parameter are written to -/// either the upper or the lower 128 bits of the result depending on the -/// value of parameter \a M. -/// \param M -/// An immediate integer. 
The least significant bit determines how the values -/// from the two parameters are interleaved: \n -/// If bit [0] of \a M is 0, \a V2 are copied to bits [127:0] of the result, -/// and bits [255:128] of \a V1 are copied to bits [255:128] of the -/// result. \n -/// If bit [0] of \a M is 1, \a V2 are copied to bits [255:128] of the -/// result, and bits [127:0] of \a V1 are copied to bits [127:0] of the -/// result. -/// \returns A 256-bit integer vector containing the interleaved values. -#define _mm256_insertf128_si256(V1, V2, M) \ - ((__m256i)__builtin_ia32_vinsertf128_si256((__v8si)(__m256i)(V1), \ - (__v4si)(__m128i)(V2), (int)(M))) - -/* - Vector extract. - We use macros rather than inlines because we only want to accept - invocations where the immediate M is a constant expression. -*/ -/// Extracts either the upper or the lower 128 bits from a 256-bit vector -/// of [8 x float], as determined by the immediate integer parameter, and -/// returns the extracted bits as a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm256_extractf128_ps(__m256 V, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction. -/// -/// \param V -/// A 256-bit vector of [8 x float]. -/// \param M -/// An immediate integer. The least significant bit determines which bits are -/// extracted from the first parameter: \n -/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the -/// result. \n -/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. -/// \returns A 128-bit vector of [4 x float] containing the extracted bits. -#define _mm256_extractf128_ps(V, M) \ - ((__m128)__builtin_ia32_vextractf128_ps256((__v8sf)(__m256)(V), (int)(M))) - -/// Extracts either the upper or the lower 128 bits from a 256-bit vector -/// of [4 x double], as determined by the immediate integer parameter, and -/// returns the extracted bits as a 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm256_extractf128_pd(__m256d V, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction. -/// -/// \param V -/// A 256-bit vector of [4 x double]. -/// \param M -/// An immediate integer. The least significant bit determines which bits are -/// extracted from the first parameter: \n -/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the -/// result. \n -/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. -/// \returns A 128-bit vector of [2 x double] containing the extracted bits. -#define _mm256_extractf128_pd(V, M) \ - ((__m128d)__builtin_ia32_vextractf128_pd256((__v4df)(__m256d)(V), (int)(M))) - -/// Extracts either the upper or the lower 128 bits from a 256-bit -/// integer vector, as determined by the immediate integer parameter, and -/// returns the extracted bits as a 128-bit integer vector. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm256_extractf128_si256(__m256i V, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction. -/// -/// \param V -/// A 256-bit integer vector. -/// \param M -/// An immediate integer. The least significant bit determines which bits are -/// extracted from the first parameter: \n -/// If bit [0] of \a M is 0, bits [127:0] of \a V are copied to the -/// result. \n -/// If bit [0] of \a M is 1, bits [255:128] of \a V are copied to the result. -/// \returns A 128-bit integer vector containing the extracted bits. 
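A usage sketch for the 128-bit insert/extract macros above; the immediate must be a constant expression, and swap_halves is an illustrative name:

#include <immintrin.h>

/* Swap the upper and lower 128-bit halves of a [8 x float] vector. */
static __m256 swap_halves(__m256 v)
{
    __m128 lo = _mm256_extractf128_ps(v, 0);    /* bits [127:0]          */
    __m128 hi = _mm256_extractf128_ps(v, 1);    /* bits [255:128]        */
    __m256 r  = _mm256_insertf128_ps(v, hi, 0); /* hi into the low half  */
    return _mm256_insertf128_ps(r, lo, 1);      /* lo into the high half */
}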
-#define _mm256_extractf128_si256(V, M) \ - ((__m128i)__builtin_ia32_vextractf128_si256((__v8si)(__m256i)(V), (int)(M))) - -/// Constructs a 256-bit floating-point vector of [8 x float] by -/// concatenating two 128-bit floating-point vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param __hi -/// A 128-bit floating-point vector of [4 x float] to be copied to the upper -/// 128 bits of the result. -/// \param __lo -/// A 128-bit floating-point vector of [4 x float] to be copied to the lower -/// 128 bits of the result. -/// \returns A 256-bit floating-point vector of [8 x float] containing the -/// concatenated result. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_set_m128 (__m128 __hi, __m128 __lo) -{ - return (__m256) __builtin_shufflevector((__v4sf)__lo, (__v4sf)__hi, 0, 1, 2, 3, 4, 5, 6, 7); -} - -/// Constructs a 256-bit floating-point vector of [4 x double] by -/// concatenating two 128-bit floating-point vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param __hi -/// A 128-bit floating-point vector of [2 x double] to be copied to the upper -/// 128 bits of the result. -/// \param __lo -/// A 128-bit floating-point vector of [2 x double] to be copied to the lower -/// 128 bits of the result. -/// \returns A 256-bit floating-point vector of [4 x double] containing the -/// concatenated result. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_set_m128d (__m128d __hi, __m128d __lo) -{ - return (__m256d) __builtin_shufflevector((__v2df)__lo, (__v2df)__hi, 0, 1, 2, 3); -} - -/// Constructs a 256-bit integer vector by concatenating two 128-bit -/// integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param __hi -/// A 128-bit integer vector to be copied to the upper 128 bits of the -/// result. -/// \param __lo -/// A 128-bit integer vector to be copied to the lower 128 bits of the -/// result. -/// \returns A 256-bit integer vector containing the concatenated result. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_set_m128i (__m128i __hi, __m128i __lo) -{ - return (__m256i) __builtin_shufflevector((__v2di)__lo, (__v2di)__hi, 0, 1, 2, 3); -} - -/// Constructs a 256-bit floating-point vector of [8 x float] by -/// concatenating two 128-bit floating-point vectors of [4 x float]. This is -/// similar to _mm256_set_m128, but the order of the input parameters is -/// swapped. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param __lo -/// A 128-bit floating-point vector of [4 x float] to be copied to the lower -/// 128 bits of the result. -/// \param __hi -/// A 128-bit floating-point vector of [4 x float] to be copied to the upper -/// 128 bits of the result. -/// \returns A 256-bit floating-point vector of [8 x float] containing the -/// concatenated result. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_setr_m128 (__m128 __lo, __m128 __hi) -{ - return _mm256_set_m128(__hi, __lo); -} - -/// Constructs a 256-bit floating-point vector of [4 x double] by -/// concatenating two 128-bit floating-point vectors of [2 x double]. This is -/// similar to _mm256_set_m128d, but the order of the input parameters is -/// swapped. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. 
-/// -/// \param __lo -/// A 128-bit floating-point vector of [2 x double] to be copied to the lower -/// 128 bits of the result. -/// \param __hi -/// A 128-bit floating-point vector of [2 x double] to be copied to the upper -/// 128 bits of the result. -/// \returns A 256-bit floating-point vector of [4 x double] containing the -/// concatenated result. -static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_setr_m128d (__m128d __lo, __m128d __hi) -{ - return (__m256d)_mm256_set_m128d(__hi, __lo); -} - -/// Constructs a 256-bit integer vector by concatenating two 128-bit -/// integer vectors. This is similar to _mm256_set_m128i, but the order of -/// the input parameters is swapped. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VINSERTF128 instruction. -/// -/// \param __lo -/// A 128-bit integer vector to be copied to the lower 128 bits of the -/// result. -/// \param __hi -/// A 128-bit integer vector to be copied to the upper 128 bits of the -/// result. -/// \returns A 256-bit integer vector containing the concatenated result. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_setr_m128i (__m128i __lo, __m128i __hi) -{ - return (__m256i)_mm256_set_m128i(__hi, __lo); -} - -/* SIMD load ops (unaligned) */ -/// Loads two 128-bit floating-point vectors of [4 x float] from -/// unaligned memory locations and constructs a 256-bit floating-point vector -/// of [8 x float] by concatenating the two 128-bit vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to load instructions followed by the -/// VINSERTF128 instruction. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location containing 4 consecutive -/// single-precision floating-point values. These values are to be copied to -/// bits[255:128] of the result. The address of the memory location does not -/// have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location containing 4 consecutive -/// single-precision floating-point values. These values are to be copied to -/// bits[127:0] of the result. The address of the memory location does not -/// have to be aligned. -/// \returns A 256-bit floating-point vector of [8 x float] containing the -/// concatenated result. -static __inline __m256 __DEFAULT_FN_ATTRS -_mm256_loadu2_m128(float const *__addr_hi, float const *__addr_lo) -{ - return _mm256_set_m128(_mm_loadu_ps(__addr_hi), _mm_loadu_ps(__addr_lo)); -} - -/// Loads two 128-bit floating-point vectors of [2 x double] from -/// unaligned memory locations and constructs a 256-bit floating-point vector -/// of [4 x double] by concatenating the two 128-bit vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to load instructions followed by the -/// VINSERTF128 instruction. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location containing two consecutive -/// double-precision floating-point values. These values are to be copied to -/// bits[255:128] of the result. The address of the memory location does not -/// have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location containing two consecutive -/// double-precision floating-point values. These values are to be copied to -/// bits[127:0] of the result. The address of the memory location does not -/// have to be aligned. -/// \returns A 256-bit floating-point vector of [4 x double] containing the -/// concatenated result. 
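A sketch of concatenating two 128-bit vectors into a 256-bit one with the constructors and unaligned loads documented above; the pointer and helper names are illustrative, and each pointer must reference at least 128 bits of readable memory:

#include <immintrin.h>

/* hi goes to bits [255:128] of the result, lo to bits [127:0]. */
static __m256 concat_ps(__m128 hi, __m128 lo)
{
    return _mm256_set_m128(hi, lo);
}

/* Two unaligned 128-bit loads followed by concatenation. */
static __m256d load_pair_pd(const double *hi_p, const double *lo_p)
{
    return _mm256_loadu2_m128d(hi_p, lo_p);
}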
-static __inline __m256d __DEFAULT_FN_ATTRS -_mm256_loadu2_m128d(double const *__addr_hi, double const *__addr_lo) -{ - return _mm256_set_m128d(_mm_loadu_pd(__addr_hi), _mm_loadu_pd(__addr_lo)); -} - -/// Loads two 128-bit integer vectors from unaligned memory locations and -/// constructs a 256-bit integer vector by concatenating the two 128-bit -/// vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to load instructions followed by the -/// VINSERTF128 instruction. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location containing a 128-bit integer -/// vector. This vector is to be copied to bits[255:128] of the result. The -/// address of the memory location does not have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location containing a 128-bit integer -/// vector. This vector is to be copied to bits[127:0] of the result. The -/// address of the memory location does not have to be aligned. -/// \returns A 256-bit integer vector containing the concatenated result. -static __inline __m256i __DEFAULT_FN_ATTRS -_mm256_loadu2_m128i(__m128i_u const *__addr_hi, __m128i_u const *__addr_lo) -{ - return _mm256_set_m128i(_mm_loadu_si128(__addr_hi), _mm_loadu_si128(__addr_lo)); -} - -/* SIMD store ops (unaligned) */ -/// Stores the upper and lower 128 bits of a 256-bit floating-point -/// vector of [8 x float] into two different unaligned memory locations. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction and the -/// store instructions. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __a -/// A 256-bit floating-point vector of [8 x float]. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu2_m128(float *__addr_hi, float *__addr_lo, __m256 __a) -{ - __m128 __v128; - - __v128 = _mm256_castps256_ps128(__a); - _mm_storeu_ps(__addr_lo, __v128); - __v128 = _mm256_extractf128_ps(__a, 1); - _mm_storeu_ps(__addr_hi, __v128); -} - -/// Stores the upper and lower 128 bits of a 256-bit floating-point -/// vector of [4 x double] into two different unaligned memory locations. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction and the -/// store instructions. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __a -/// A 256-bit floating-point vector of [4 x double]. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu2_m128d(double *__addr_hi, double *__addr_lo, __m256d __a) -{ - __m128d __v128; - - __v128 = _mm256_castpd256_pd128(__a); - _mm_storeu_pd(__addr_lo, __v128); - __v128 = _mm256_extractf128_pd(__a, 1); - _mm_storeu_pd(__addr_hi, __v128); -} - -/// Stores the upper and lower 128 bits of a 256-bit integer vector into -/// two different unaligned memory locations. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VEXTRACTF128 instruction and the -/// store instructions. -/// -/// \param __addr_hi -/// A pointer to a 128-bit memory location. Bits[255:128] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __addr_lo -/// A pointer to a 128-bit memory location. Bits[127:0] of \a __a are to be -/// copied to this memory location. The address of this memory location does -/// not have to be aligned. -/// \param __a -/// A 256-bit integer vector. -static __inline void __DEFAULT_FN_ATTRS -_mm256_storeu2_m128i(__m128i_u *__addr_hi, __m128i_u *__addr_lo, __m256i __a) -{ - __m128i __v128; - - __v128 = _mm256_castsi256_si128(__a); - _mm_storeu_si128(__addr_lo, __v128); - __v128 = _mm256_extractf128_si256(__a, 1); - _mm_storeu_si128(__addr_hi, __v128); -} - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS128 - -#endif /* __AVXINTRIN_H */ diff --git a/include/avxvnniintrin.h b/include/avxvnniintrin.h deleted file mode 100644 index ad45cb7..0000000 --- a/include/avxvnniintrin.h +++ /dev/null @@ -1,225 +0,0 @@ -/*===--------------- avxvnniintrin.h - VNNI intrinsics --------------------=== - * - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __AVXVNNIINTRIN_H -#define __AVXVNNIINTRIN_H - -/* Below intrinsics defined in avx512vlvnniintrin.h can be used for AVXVNNI */ -/// \fn __m256i _mm256_dpbusd_epi32(__m256i __S, __m256i __A, __m256i __B) -/// \fn __m256i _mm256_dpbusds_epi32(__m256i __S, __m256i __A, __m256i __B) -/// \fn __m256i _mm256_dpwssd_epi32(__m256i __S, __m256i __A, __m256i __B) -/// \fn __m256i _mm256_dpwssds_epi32(__m256i __S, __m256i __A, __m256i __B) -/// \fn __m128i _mm_dpbusd_epi32(__m128i __S, __m128i __A, __m128i __B) -/// \fn __m128i _mm_dpbusds_epi32(__m128i __S, __m128i __A, __m128i __B) -/// \fn __m128i _mm_dpwssd_epi32(__m128i __S, __m128i __A, __m128i __B) -/// \fn __m128i _mm_dpwssds_epi32(__m128i __S, __m128i __A, __m128i __B) - -/* Intrinsics with _avx_ prefix are for compatibility with msvc. */ -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(256))) -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("avxvnni"), __min_vector_width__(128))) - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with -/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a __S, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) -/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusd256((__v8si)__S, (__v8si)__A, (__v8si)__B); -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with -/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a __S using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSDS instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) -/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpbusds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpbusds256((__v8si)__S, (__v8si)__A, (__v8si)__B); -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, -/// and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSD instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) -/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssd_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssd256((__v8si)__S, (__v8si)__A, (__v8si)__B); -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit -/// results. 
Sum these 2 results with the corresponding 32-bit integer in \a __S -/// using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSDS instructions. -/// -/// \operation -/// FOR j := 0 to 7 -/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) -/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) -/// ENDFOR -/// DST[MAX:256] := 0 -/// \endoperation -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_dpwssds_avx_epi32(__m256i __S, __m256i __A, __m256i __B) -{ - return (__m256i)__builtin_ia32_vpdpwssds256((__v8si)__S, (__v8si)__A, (__v8si)__B); -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with -/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a __S, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSD instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) -/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4 -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusd128((__v4si)__S, (__v4si)__A, (__v4si)__B); -} - -/// Multiply groups of 4 adjacent pairs of unsigned 8-bit integers in \a __A with -/// corresponding signed 8-bit integers in \a __B, producing 4 intermediate signed -/// 16-bit results. Sum these 4 results with the corresponding 32-bit integer -/// in \a __S using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPBUSDS instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.word := Signed(ZeroExtend16(__A.byte[4*j]) * SignExtend16(__B.byte[4*j])) -/// tmp2.word := Signed(ZeroExtend16(__A.byte[4*j+1]) * SignExtend16(__B.byte[4*j+1])) -/// tmp3.word := Signed(ZeroExtend16(__A.byte[4*j+2]) * SignExtend16(__B.byte[4*j+2])) -/// tmp4.word := Signed(ZeroExtend16(__A.byte[4*j+3]) * SignExtend16(__B.byte[4*j+3])) -/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2 + tmp3 + tmp4) -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpbusds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpbusds128((__v4si)__S, (__v4si)__A, (__v4si)__B); -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S, -/// and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSD instructions. 
-/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) -/// DST.dword[j] := __S.dword[j] + tmp1 + tmp2 -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssd_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssd128((__v4si)__S, (__v4si)__A, (__v4si)__B); -} - -/// Multiply groups of 2 adjacent pairs of signed 16-bit integers in \a __A with -/// corresponding 16-bit integers in \a __B, producing 2 intermediate signed 32-bit -/// results. Sum these 2 results with the corresponding 32-bit integer in \a __S -/// using signed saturation, and store the packed 32-bit results in DST. -/// -/// This intrinsic corresponds to the VPDPWSSDS instructions. -/// -/// \operation -/// FOR j := 0 to 3 -/// tmp1.dword := SignExtend32(__A.word[2*j]) * SignExtend32(__B.word[2*j]) -/// tmp2.dword := SignExtend32(__A.word[2*j+1]) * SignExtend32(__B.word[2*j+1]) -/// DST.dword[j] := Saturate32(__S.dword[j] + tmp1 + tmp2) -/// ENDFOR -/// DST[MAX:128] := 0 -/// \endoperation -static __inline__ __m128i __DEFAULT_FN_ATTRS128 -_mm_dpwssds_avx_epi32(__m128i __S, __m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpdpwssds128((__v4si)__S, (__v4si)__A, (__v4si)__B); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif // __AVXVNNIINTRIN_H diff --git a/include/bmi2intrin.h b/include/bmi2intrin.h deleted file mode 100644 index 0b56aed..0000000 --- a/include/bmi2intrin.h +++ /dev/null @@ -1,81 +0,0 @@ -/*===---- bmi2intrin.h - BMI2 intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __BMI2INTRIN_H -#define __BMI2INTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi2"))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bzhi_u32(unsigned int __X, unsigned int __Y) -{ - return __builtin_ia32_bzhi_si(__X, __Y); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pdep_u32(unsigned int __X, unsigned int __Y) -{ - return __builtin_ia32_pdep_si(__X, __Y); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_pext_u32(unsigned int __X, unsigned int __Y) -{ - return __builtin_ia32_pext_si(__X, __Y); -} - -#ifdef __x86_64__ - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bzhi_u64(unsigned long long __X, unsigned long long __Y) -{ - return __builtin_ia32_bzhi_di(__X, __Y); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pdep_u64(unsigned long long __X, unsigned long long __Y) -{ - return __builtin_ia32_pdep_di(__X, __Y); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_pext_u64(unsigned long long __X, unsigned long long __Y) -{ - return __builtin_ia32_pext_di(__X, __Y); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_mulx_u64 (unsigned long long __X, unsigned long long __Y, - unsigned long long *__P) -{ - unsigned __int128 __res = (unsigned __int128) __X * __Y; - *__P = (unsigned long long) (__res >> 64); - return (unsigned long long) __res; -} - -#else /* !__x86_64__ */ - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P) -{ - unsigned long long __res = (unsigned long long) __X * __Y; - *__P = (unsigned int) (__res >> 32); - return (unsigned int) __res; -} - -#endif /* !__x86_64__ */ - -#undef __DEFAULT_FN_ATTRS - -#endif /* __BMI2INTRIN_H */ diff --git a/include/bmiintrin.h b/include/bmiintrin.h deleted file mode 100644 index f583c21..0000000 --- a/include/bmiintrin.h +++ /dev/null @@ -1,427 +0,0 @@ -/*===---- bmiintrin.h - BMI intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __BMIINTRIN_H -#define __BMIINTRIN_H - -/* Allow using the tzcnt intrinsics even for non-BMI targets. Since the TZCNT - instruction behaves as BSF on non-BMI targets, there is code that expects - to use it as a potentially faster version of BSF. */ -#define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) - -#define _tzcnt_u16(a) (__tzcnt_u16((a))) - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TZCNT instruction. -/// -/// \param __X -/// An unsigned 16-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 16-bit integer containing the number of trailing zero -/// bits in the operand. -static __inline__ unsigned short __RELAXED_FN_ATTRS -__tzcnt_u16(unsigned short __X) -{ - return __builtin_ia32_tzcnt_u16(__X); -} - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TZCNT instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose trailing zeros are to be counted. 
-/// \returns An unsigned 32-bit integer containing the number of trailing zero -/// bits in the operand. -static __inline__ unsigned int __RELAXED_FN_ATTRS -__tzcnt_u32(unsigned int __X) -{ - return __builtin_ia32_tzcnt_u32(__X); -} - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TZCNT instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An 32-bit integer containing the number of trailing zero bits in -/// the operand. -static __inline__ int __RELAXED_FN_ATTRS -_mm_tzcnt_32(unsigned int __X) -{ - return __builtin_ia32_tzcnt_u32(__X); -} - -#define _tzcnt_u32(a) (__tzcnt_u32((a))) - -#ifdef __x86_64__ - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TZCNT instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An unsigned 64-bit integer containing the number of trailing zero -/// bits in the operand. -static __inline__ unsigned long long __RELAXED_FN_ATTRS -__tzcnt_u64(unsigned long long __X) -{ - return __builtin_ia32_tzcnt_u64(__X); -} - -/// Counts the number of trailing zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TZCNT instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose trailing zeros are to be counted. -/// \returns An 64-bit integer containing the number of trailing zero bits in -/// the operand. -static __inline__ long long __RELAXED_FN_ATTRS -_mm_tzcnt_64(unsigned long long __X) -{ - return __builtin_ia32_tzcnt_u64(__X); -} - -#define _tzcnt_u64(a) (__tzcnt_u64((a))) - -#endif /* __x86_64__ */ - -#undef __RELAXED_FN_ATTRS - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__BMI__) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) - -#define _andn_u32(a, b) (__andn_u32((a), (b))) - -/* _bextr_u32 != __bextr_u32 */ -#define _blsi_u32(a) (__blsi_u32((a))) - -#define _blsmsk_u32(a) (__blsmsk_u32((a))) - -#define _blsr_u32(a) (__blsr_u32((a))) - -/// Performs a bitwise AND of the second operand with the one's -/// complement of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the ANDN instruction. -/// -/// \param __X -/// An unsigned integer containing one of the operands. -/// \param __Y -/// An unsigned integer containing one of the operands. -/// \returns An unsigned integer containing the bitwise AND of the second -/// operand with the one's complement of the first operand. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__andn_u32(unsigned int __X, unsigned int __Y) -{ - return ~__X & __Y; -} - -/* AMD-specified, double-leading-underscore version of BEXTR */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned integer whose bits are to be extracted. -/// \param __Y -/// An unsigned integer used to specify which bits are extracted. Bits [7:0] -/// specify the index of the least significant bit. Bits [15:8] specify the -/// number of bits to be extracted. -/// \returns An unsigned integer whose least significant bits contain the -/// extracted bits. 
-/// \see _bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__bextr_u32(unsigned int __X, unsigned int __Y) -{ - return __builtin_ia32_bextr_u32(__X, __Y); -} - -/* Intel-specified, single-leading-underscore version of BEXTR */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned integer whose bits are to be extracted. -/// \param __Y -/// An unsigned integer used to specify the index of the least significant -/// bit for the bits to be extracted. Bits [7:0] specify the index. -/// \param __Z -/// An unsigned integer used to specify the number of bits to be extracted. -/// Bits [7:0] specify the number of bits. -/// \returns An unsigned integer whose least significant bits contain the -/// extracted bits. -/// \see __bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u32 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); -} - -/* Intel-specified, single-leading-underscore version of BEXTR2 */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned integer whose bits are to be extracted. -/// \param __Y -/// An unsigned integer used to specify which bits are extracted. Bits [7:0] -/// specify the index of the least significant bit. Bits [15:8] specify the -/// number of bits to be extracted. -/// \returns An unsigned integer whose least significant bits contain the -/// extracted bits. -/// \see __bextr_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_bextr2_u32(unsigned int __X, unsigned int __Y) { - return __builtin_ia32_bextr_u32(__X, __Y); -} - -/// Clears all bits in the source except for the least significant bit -/// containing a value of 1 and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSI instruction. -/// -/// \param __X -/// An unsigned integer whose bits are to be cleared. -/// \returns An unsigned integer containing the result of clearing the bits from -/// the source operand. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsi_u32(unsigned int __X) -{ - return __X & -__X; -} - -/// Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least significant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSMSK instruction. -/// -/// \param __X -/// An unsigned integer used to create the mask. -/// \returns An unsigned integer containing the newly created mask. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsmsk_u32(unsigned int __X) -{ - return __X ^ (__X - 1); -} - -/// Clears the least significant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSR instruction. -/// -/// \param __X -/// An unsigned integer containing the operand to be cleared. -/// \returns An unsigned integer containing the result of clearing the source -/// operand. 
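A sketch of walking the set bits of a word with the BMI helpers defined above; the function name is illustrative and BMI must be enabled (e.g. -mbmi):

#include <immintrin.h>

static void visit_set_bits(unsigned int mask)
{
    while (mask) {
        unsigned int idx = __tzcnt_u32(mask);  /* index of the lowest set bit */
        unsigned int bit = __blsi_u32(mask);   /* that bit, isolated          */
        (void)idx; (void)bit;                  /* ... use idx/bit here ...    */
        mask = __blsr_u32(mask);               /* clear the lowest set bit    */
    }
}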
-static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsr_u32(unsigned int __X) -{ - return __X & (__X - 1); -} - -#ifdef __x86_64__ - -#define _andn_u64(a, b) (__andn_u64((a), (b))) - -/* _bextr_u64 != __bextr_u64 */ -#define _blsi_u64(a) (__blsi_u64((a))) - -#define _blsmsk_u64(a) (__blsmsk_u64((a))) - -#define _blsr_u64(a) (__blsr_u64((a))) - -/// Performs a bitwise AND of the second operand with the one's -/// complement of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the ANDN instruction. -/// -/// \param __X -/// An unsigned 64-bit integer containing one of the operands. -/// \param __Y -/// An unsigned 64-bit integer containing one of the operands. -/// \returns An unsigned 64-bit integer containing the bitwise AND of the second -/// operand with the one's complement of the first operand. -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__andn_u64 (unsigned long long __X, unsigned long long __Y) -{ - return ~__X & __Y; -} - -/* AMD-specified, double-leading-underscore version of BEXTR */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose bits are to be extracted. -/// \param __Y -/// An unsigned 64-bit integer used to specify which bits are extracted. Bits -/// [7:0] specify the index of the least significant bit. Bits [15:8] specify -/// the number of bits to be extracted. -/// \returns An unsigned 64-bit integer whose least significant bits contain the -/// extracted bits. -/// \see _bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__bextr_u64(unsigned long long __X, unsigned long long __Y) -{ - return __builtin_ia32_bextr_u64(__X, __Y); -} - -/* Intel-specified, single-leading-underscore version of BEXTR */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose bits are to be extracted. -/// \param __Y -/// An unsigned integer used to specify the index of the least significant -/// bit for the bits to be extracted. Bits [7:0] specify the index. -/// \param __Z -/// An unsigned integer used to specify the number of bits to be extracted. -/// Bits [7:0] specify the number of bits. -/// \returns An unsigned 64-bit integer whose least significant bits contain the -/// extracted bits. -/// \see __bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) -{ - return __builtin_ia32_bextr_u64 (__X, ((__Y & 0xff) | ((__Z & 0xff) << 8))); -} - -/* Intel-specified, single-leading-underscore version of BEXTR2 */ -/// Extracts the specified bits from the first operand and returns them -/// in the least significant bits of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BEXTR instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose bits are to be extracted. -/// \param __Y -/// An unsigned 64-bit integer used to specify which bits are extracted. Bits -/// [7:0] specify the index of the least significant bit. Bits [15:8] specify -/// the number of bits to be extracted. -/// \returns An unsigned 64-bit integer whose least significant bits contain the -/// extracted bits. 
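A sketch of extracting a bit-field with the Intel-form BEXTR wrapper documented above; the start bit and width are made-up values, and the code assumes an x86-64 target with BMI enabled (e.g. -mbmi):

#include <immintrin.h>

/* Extract the 8-bit field that starts at bit 12 of a 64-bit word. */
static unsigned long long read_field(unsigned long long word)
{
    return _bextr_u64(word, 12, 8);   /* (source, start index, bit count) */
}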
-/// \see __bextr_u64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_bextr2_u64(unsigned long long __X, unsigned long long __Y) { - return __builtin_ia32_bextr_u64(__X, __Y); -} - -/// Clears all bits in the source except for the least significant bit -/// containing a value of 1 and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSI instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose bits are to be cleared. -/// \returns An unsigned 64-bit integer containing the result of clearing the -/// bits from the source operand. -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsi_u64(unsigned long long __X) -{ - return __X & -__X; -} - -/// Creates a mask whose bits are set to 1, using bit 0 up to and -/// including the least significant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSMSK instruction. -/// -/// \param __X -/// An unsigned 64-bit integer used to create the mask. -/// \returns An unsigned 64-bit integer containing the newly created mask. -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsmsk_u64(unsigned long long __X) -{ - return __X ^ (__X - 1); -} - -/// Clears the least significant bit that is set to 1 in the source -/// operand and returns the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the BLSR instruction. -/// -/// \param __X -/// An unsigned 64-bit integer containing the operand to be cleared. -/// \returns An unsigned 64-bit integer containing the result of clearing the -/// source operand. -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsr_u64(unsigned long long __X) -{ - return __X & (__X - 1); -} - -#endif /* __x86_64__ */ - -#undef __DEFAULT_FN_ATTRS - -#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ - || defined(__BMI__) */ - -#endif /* __BMIINTRIN_H */ diff --git a/include/cetintrin.h b/include/cetintrin.h deleted file mode 100644 index 019cab0..0000000 --- a/include/cetintrin.h +++ /dev/null @@ -1,109 +0,0 @@ -/*===---- cetintrin.h - CET intrinsic --------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __CETINTRIN_H -#define __CETINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("shstk"))) - -static __inline__ void __DEFAULT_FN_ATTRS _incsspd(int __a) { - __builtin_ia32_incsspd(__a); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS _incsspq(unsigned long long __a) { - __builtin_ia32_incsspq(__a); -} -#endif /* __x86_64__ */ - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { - __builtin_ia32_incsspq(__a); -} -#else /* __x86_64__ */ -static __inline__ void __DEFAULT_FN_ATTRS _inc_ssp(unsigned int __a) { - __builtin_ia32_incsspd((int)__a); -} -#endif /* __x86_64__ */ - -static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd(unsigned int __a) { - return __builtin_ia32_rdsspd(__a); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS _rdsspd_i32() { - unsigned int t; - return __builtin_ia32_rdsspd(t); -} - -#ifdef __x86_64__ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq(unsigned long long __a) { - return __builtin_ia32_rdsspq(__a); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS _rdsspq_i64() { - unsigned long long t; - return __builtin_ia32_rdsspq(t); -} -#endif /* __x86_64__ */ - -#ifdef __x86_64__ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS _get_ssp(void) { - return __builtin_ia32_rdsspq(0); -} -#else /* __x86_64__ */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS _get_ssp(void) { - return __builtin_ia32_rdsspd(0); -} -#endif /* __x86_64__ */ - -static __inline__ void __DEFAULT_FN_ATTRS _saveprevssp() { - __builtin_ia32_saveprevssp(); -} - -static __inline__ void __DEFAULT_FN_ATTRS _rstorssp(void * __p) { - __builtin_ia32_rstorssp(__p); -} - -static __inline__ void __DEFAULT_FN_ATTRS _wrssd(unsigned int __a, void * __p) { - __builtin_ia32_wrssd(__a, __p); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS _wrssq(unsigned long long __a, void * __p) { - __builtin_ia32_wrssq(__a, __p); -} -#endif /* __x86_64__ */ - -static __inline__ void __DEFAULT_FN_ATTRS _wrussd(unsigned int __a, void * __p) { - __builtin_ia32_wrussd(__a, __p); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS _wrussq(unsigned long long __a, void * __p) { - __builtin_ia32_wrussq(__a, __p); -} -#endif /* __x86_64__ */ - -static __inline__ void __DEFAULT_FN_ATTRS _setssbsy() { - __builtin_ia32_setssbsy(); -} - -static __inline__ void __DEFAULT_FN_ATTRS _clrssbsy(void * __p) { - __builtin_ia32_clrssbsy(__p); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __CETINTRIN_H */ diff --git a/include/cldemoteintrin.h b/include/cldemoteintrin.h deleted file mode 100644 index cfb951c..0000000 --- a/include/cldemoteintrin.h +++ /dev/null @@ -1,36 +0,0 @@ -/*===---- cldemoteintrin.h - CLDEMOTE intrinsic ----------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __CLDEMOTEINTRIN_H -#define __CLDEMOTEINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("cldemote"))) - -/// Hint to hardware that the cache line that contains \p __P should be demoted -/// from the cache closest to the processor core to a level more distant from -/// the processor core. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CLDEMOTE instruction. -static __inline__ void __DEFAULT_FN_ATTRS -_cldemote(const void * __P) { - __builtin_ia32_cldemote(__P); -} - -#define _mm_cldemote(p) _cldemote(p) -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/clflushoptintrin.h b/include/clflushoptintrin.h deleted file mode 100644 index 060eb36..0000000 --- a/include/clflushoptintrin.h +++ /dev/null @@ -1,27 +0,0 @@ -/*===---- clflushoptintrin.h - CLFLUSHOPT intrinsic ------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __CLFLUSHOPTINTRIN_H -#define __CLFLUSHOPTINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clflushopt"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_mm_clflushopt(void const * __m) { - __builtin_ia32_clflushopt(__m); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/clwbintrin.h b/include/clwbintrin.h deleted file mode 100644 index 3360d20..0000000 --- a/include/clwbintrin.h +++ /dev/null @@ -1,38 +0,0 @@ -/*===---- clwbintrin.h - CLWB intrinsic ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __CLWBINTRIN_H -#define __CLWBINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("clwb"))) - -/// Writes back to memory the cache line (if modified) that contains the -/// linear address specified in \a __p from any level of the cache hierarchy in -/// the cache coherence domain -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CLWB instruction. -/// -/// \param __p -/// A pointer to the memory location used to identify the cache line to be -/// written back. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_clwb(void const *__p) { - __builtin_ia32_clwb(__p); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/clzerointrin.h b/include/clzerointrin.h deleted file mode 100644 index a180984..0000000 --- a/include/clzerointrin.h +++ /dev/null @@ -1,36 +0,0 @@ -/*===----------------------- clzerointrin.h - CLZERO ----------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __CLZEROINTRIN_H -#define __CLZEROINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("clzero"))) - -/// Loads the cache line address and zero's out the cacheline -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CLZERO instruction. -/// -/// \param __line -/// A pointer to a cacheline which needs to be zeroed out. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_clzero (void * __line) -{ - __builtin_ia32_clzero ((void *)__line); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __CLZEROINTRIN_H */ diff --git a/include/crc32intrin.h b/include/crc32intrin.h deleted file mode 100644 index a0bd99d..0000000 --- a/include/crc32intrin.h +++ /dev/null @@ -1,100 +0,0 @@ -/*===---- crc32intrin.h - SSE4.2 Accumulate CRC32 intrinsics ---------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __CRC32INTRIN_H -#define __CRC32INTRIN_H - -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("crc32"))) - -/// Adds the unsigned integer operand to the CRC-32C checksum of the -/// unsigned char operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CRC32B instruction. -/// -/// \param __C -/// An unsigned integer operand to add to the CRC-32C checksum of operand -/// \a __D. -/// \param __D -/// An unsigned 8-bit integer operand used to compute the CRC-32C checksum. -/// \returns The result of adding operand \a __C to the CRC-32C checksum of -/// operand \a __D. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_crc32_u8(unsigned int __C, unsigned char __D) -{ - return __builtin_ia32_crc32qi(__C, __D); -} - -/// Adds the unsigned integer operand to the CRC-32C checksum of the -/// unsigned short operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CRC32W instruction. -/// -/// \param __C -/// An unsigned integer operand to add to the CRC-32C checksum of operand -/// \a __D. -/// \param __D -/// An unsigned 16-bit integer operand used to compute the CRC-32C checksum. -/// \returns The result of adding operand \a __C to the CRC-32C checksum of -/// operand \a __D. -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_crc32_u16(unsigned int __C, unsigned short __D) -{ - return __builtin_ia32_crc32hi(__C, __D); -} - -/// Adds the first unsigned integer operand to the CRC-32C checksum of -/// the second unsigned integer operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CRC32L instruction. -/// -/// \param __C -/// An unsigned integer operand to add to the CRC-32C checksum of operand -/// \a __D. -/// \param __D -/// An unsigned 32-bit integer operand used to compute the CRC-32C checksum. -/// \returns The result of adding operand \a __C to the CRC-32C checksum of -/// operand \a __D. 
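The crc32intrin.h accumulators being removed are normally applied in a loop over a buffer. A small sketch, assuming clang with -msse4.2 (or -mcrc32); the 0xFFFFFFFF seed and final XOR follow the usual CRC-32C convention and are not part of the header itself.

#include <immintrin.h>
#include <stddef.h>

/* CRC-32C (Castagnoli) of a byte buffer, folding one byte per step. */
unsigned int crc32c(const unsigned char *p, size_t n)
{
  unsigned int crc = 0xFFFFFFFFu;        /* conventional initial value */
  for (size_t i = 0; i < n; i++)
    crc = _mm_crc32_u8(crc, p[i]);       /* accumulate one byte        */
  return crc ^ 0xFFFFFFFFu;              /* conventional final XOR     */
}

The wider _mm_crc32_u32/_mm_crc32_u64 forms fold 4 or 8 bytes per step in the same manner.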
-static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_crc32_u32(unsigned int __C, unsigned int __D) -{ - return __builtin_ia32_crc32si(__C, __D); -} - -#ifdef __x86_64__ -/// Adds the unsigned integer operand to the CRC-32C checksum of the -/// unsigned 64-bit integer operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CRC32Q instruction. -/// -/// \param __C -/// An unsigned integer operand to add to the CRC-32C checksum of operand -/// \a __D. -/// \param __D -/// An unsigned 64-bit integer operand used to compute the CRC-32C checksum. -/// \returns The result of adding operand \a __C to the CRC-32C checksum of -/// operand \a __D. -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_mm_crc32_u64(unsigned long long __C, unsigned long long __D) -{ - return __builtin_ia32_crc32di(__C, __D); -} -#endif /* __x86_64__ */ - -#undef __DEFAULT_FN_ATTRS - -#endif /* __CRC32INTRIN_H */ diff --git a/include/emmintrin.h b/include/emmintrin.h deleted file mode 100644 index e00968e..0000000 --- a/include/emmintrin.h +++ /dev/null @@ -1,5045 +0,0 @@ -/*===---- emmintrin.h - SSE2 intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __EMMINTRIN_H -#define __EMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -typedef double __m128d __attribute__((__vector_size__(16), __aligned__(16))); -typedef long long __m128i __attribute__((__vector_size__(16), __aligned__(16))); - -typedef double __m128d_u __attribute__((__vector_size__(16), __aligned__(1))); -typedef long long __m128i_u __attribute__((__vector_size__(16), __aligned__(1))); - -/* Type defines. */ -typedef double __v2df __attribute__ ((__vector_size__ (16))); -typedef long long __v2di __attribute__ ((__vector_size__ (16))); -typedef short __v8hi __attribute__((__vector_size__(16))); -typedef char __v16qi __attribute__((__vector_size__(16))); - -/* Unsigned types */ -typedef unsigned long long __v2du __attribute__ ((__vector_size__ (16))); -typedef unsigned short __v8hu __attribute__((__vector_size__(16))); -typedef unsigned char __v16qu __attribute__((__vector_size__(16))); - -/* We need an explicitly signed variant for char. Note that this shouldn't - * appear in the interface though. */ -typedef signed char __v16qs __attribute__((__vector_size__(16))); - -#if (__clang_major__ > 15) -#ifdef __SSE2__ -/* Both _Float16 and __bf16 require SSE2 being enabled. */ -typedef _Float16 __v8hf __attribute__((__vector_size__(16), __aligned__(16))); -typedef _Float16 __m128h __attribute__((__vector_size__(16), __aligned__(16))); -typedef _Float16 __m128h_u __attribute__((__vector_size__(16), __aligned__(1))); - -typedef __bf16 __v8bf __attribute__((__vector_size__(16), __aligned__(16))); -typedef __bf16 __m128bh __attribute__((__vector_size__(16), __aligned__(16))); -#endif -#endif - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse2"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse2"), __min_vector_width__(64))) - -/// Adds lower double-precision values in both operands and returns the -/// sum in the lower 64 bits of the result. The upper 64 bits of the result -/// are copied from the upper double-precision value of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSD / ADDSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// sum of the lower 64 bits of both operands. The upper 64 bits are copied -/// from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_add_sd(__m128d __a, __m128d __b) -{ - __a[0] += __b[0]; - return __a; -} - -/// Adds two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDPD / ADDPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] containing the sums of both -/// operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_add_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2df)__a + (__v2df)__b); -} - -/// Subtracts the lower double-precision value of the second operand -/// from the lower double-precision value of the first operand and returns -/// the difference in the lower 64 bits of the result. The upper 64 bits of -/// the result are copied from the upper double-precision value of the first -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBSD / SUBSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the minuend. -/// \param __b -/// A 128-bit vector of [2 x double] containing the subtrahend. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// difference of the lower 64 bits of both operands. The upper 64 bits are -/// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_sub_sd(__m128d __a, __m128d __b) -{ - __a[0] -= __b[0]; - return __a; -} - -/// Subtracts two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBPD / SUBPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the minuend. -/// \param __b -/// A 128-bit vector of [2 x double] containing the subtrahend. -/// \returns A 128-bit vector of [2 x double] containing the differences between -/// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_sub_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2df)__a - (__v2df)__b); -} - -/// Multiplies lower double-precision values in both operands and returns -/// the product in the lower 64 bits of the result. The upper 64 bits of the -/// result are copied from the upper double-precision value of the first -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULSD / MULSD instruction. 
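The scalar (_sd) and packed (_pd) arithmetic forms above differ only in which lanes participate. A short sketch contrasting them; the lane values are made up, and _mm_set_pd/_mm_storeu_pd are the ordinary SSE2 helpers defined elsewhere in this header.

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
  __m128d a = _mm_set_pd(3.0, 1.0);   /* a = { 1.0, 3.0 } (low lane listed first) */
  __m128d b = _mm_set_pd(4.0, 2.0);   /* b = { 2.0, 4.0 } */

  __m128d sum  = _mm_add_pd(a, b);    /* { 3.0, 7.0 }: both lanes added */
  __m128d ssum = _mm_add_sd(a, b);    /* { 3.0, 3.0 }: low lanes added, high lane copied from a */

  double out[2];
  _mm_storeu_pd(out, sum);
  printf("packed sum: %g %g\n", out[0], out[1]);
  _mm_storeu_pd(out, ssum);
  printf("scalar sum: %g %g\n", out[0], out[1]);
  return 0;
}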
-/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// product of the lower 64 bits of both operands. The upper 64 bits are -/// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mul_sd(__m128d __a, __m128d __b) -{ - __a[0] *= __b[0]; - return __a; -} - -/// Multiplies two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULPD / MULPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \returns A 128-bit vector of [2 x double] containing the products of both -/// operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_mul_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2df)__a * (__v2df)__b); -} - -/// Divides the lower double-precision value of the first operand by the -/// lower double-precision value of the second operand and returns the -/// quotient in the lower 64 bits of the result. The upper 64 bits of the -/// result are copied from the upper double-precision value of the first -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVSD / DIVSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the dividend. -/// \param __b -/// A 128-bit vector of [2 x double] containing divisor. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// quotient of the lower 64 bits of both operands. The upper 64 bits are -/// copied from the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_div_sd(__m128d __a, __m128d __b) -{ - __a[0] /= __b[0]; - return __a; -} - -/// Performs an element-by-element division of two 128-bit vectors of -/// [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVPD / DIVPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the dividend. -/// \param __b -/// A 128-bit vector of [2 x double] containing the divisor. -/// \returns A 128-bit vector of [2 x double] containing the quotients of both -/// operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_div_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2df)__a / (__v2df)__b); -} - -/// Calculates the square root of the lower double-precision value of -/// the second operand and returns it in the lower 64 bits of the result. -/// The upper 64 bits of the result are copied from the upper -/// double-precision value of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTSD / SQRTSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// upper 64 bits of this operand are copied to the upper 64 bits of the -/// result. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// square root is calculated using the lower 64 bits of this operand. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// square root of the lower 64 bits of operand \a __b, and whose upper 64 -/// bits are copied from the upper 64 bits of operand \a __a. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_sqrt_sd(__m128d __a, __m128d __b) -{ - __m128d __c = __builtin_ia32_sqrtsd((__v2df)__b); - return __extension__ (__m128d) { __c[0], __a[1] }; -} - -/// Calculates the square root of the each of two values stored in a -/// 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTPD / SQRTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector of [2 x double] containing the square roots of the -/// values in the operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_sqrt_pd(__m128d __a) -{ - return __builtin_ia32_sqrtpd((__v2df)__a); -} - -/// Compares lower 64-bit double-precision values of both operands, and -/// returns the lesser of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper -/// double-precision value of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINSD / MINSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// lower 64 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// lower 64 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// minimum value between both operands. The upper 64 bits are copied from -/// the upper 64 bits of the first source operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_min_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_minsd((__v2df)__a, (__v2df)__b); -} - -/// Performs element-by-element comparison of the two 128-bit vectors of -/// [2 x double] and returns the vector containing the lesser of each pair of -/// values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINPD / MINPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \returns A 128-bit vector of [2 x double] containing the minimum values -/// between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_min_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_minpd((__v2df)__a, (__v2df)__b); -} - -/// Compares lower 64-bit double-precision values of both operands, and -/// returns the greater of the pair of values in the lower 64-bits of the -/// result. The upper 64 bits of the result are copied from the upper -/// double-precision value of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXSD / MAXSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// lower 64 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. The -/// lower 64 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// maximum value between both operands. The upper 64 bits are copied from -/// the upper 64 bits of the first source operand. 
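One way the packed min/max/sqrt operations in this neighbourhood combine in practice is range clamping. A minimal sketch, assuming the usual _mm_set1_pd broadcast helper from later in this header; the function and parameter names are illustrative.

#include <emmintrin.h>

/* Clamp both lanes of v into [lo, hi], then take the lane-wise square root. */
static inline __m128d clamped_sqrt(__m128d v, double lo, double hi)
{
  v = _mm_max_pd(v, _mm_set1_pd(lo));   /* lane-wise lower bound */
  v = _mm_min_pd(v, _mm_set1_pd(hi));   /* lane-wise upper bound */
  return _mm_sqrt_pd(v);                /* lane-wise square root */
}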
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_max_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_maxsd((__v2df)__a, (__v2df)__b); -} - -/// Performs element-by-element comparison of the two 128-bit vectors of -/// [2 x double] and returns the vector containing the greater of each pair -/// of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXPD / MAXPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the operands. -/// \returns A 128-bit vector of [2 x double] containing the maximum values -/// between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_max_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_maxpd((__v2df)__a, (__v2df)__b); -} - -/// Performs a bitwise AND of two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPAND / PAND instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the -/// values between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_and_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2du)__a & (__v2du)__b); -} - -/// Performs a bitwise AND of two 128-bit vectors of [2 x double], using -/// the one's complement of the values contained in the first source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPANDN / PANDN instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the left source operand. The -/// one's complement of this value is used in the bitwise AND. -/// \param __b -/// A 128-bit vector of [2 x double] containing the right source operand. -/// \returns A 128-bit vector of [2 x double] containing the bitwise AND of the -/// values in the second operand and the one's complement of the first -/// operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_andnot_pd(__m128d __a, __m128d __b) -{ - return (__m128d)(~(__v2du)__a & (__v2du)__b); -} - -/// Performs a bitwise OR of two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPOR / POR instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] containing the bitwise OR of the -/// values between both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_or_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2du)__a | (__v2du)__b); -} - -/// Performs a bitwise XOR of two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPXOR / PXOR instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// \returns A 128-bit vector of [2 x double] containing the bitwise XOR of the -/// values between both operands. 
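Because the bitwise _pd operations treat the doubles purely as bit patterns, they are the customary way to manipulate sign bits. A small sketch built only on the AND-NOT and XOR forms described here; the helper names are made up.

#include <emmintrin.h>

/* Lane-wise fabs(): -0.0 has only the sign bit set, so ~(-0.0) & x clears it. */
static inline __m128d abs_pd(__m128d x)
{
  return _mm_andnot_pd(_mm_set1_pd(-0.0), x);
}

/* Lane-wise negation: flip the sign bits with XOR. */
static inline __m128d neg_pd(__m128d x)
{
  return _mm_xor_pd(x, _mm_set1_pd(-0.0));
}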
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_xor_pd(__m128d __a, __m128d __b) -{ - return (__m128d)((__v2du)__a ^ (__v2du)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] for equality. Each comparison yields 0x0 -/// for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPEQPD / CMPEQPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpeq_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpeqpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are less than those in the second operand. Each comparison -/// yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTPD / CMPLTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmplt_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpltpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are less than or equal to those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLEPD / CMPLEPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmple_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmplepd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are greater than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTPD / CMPLTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpgt_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpltpd((__v2df)__b, (__v2df)__a); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are greater than or equal to those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLEPD / CMPLEPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpge_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmplepd((__v2df)__b, (__v2df)__a); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are ordered with respect to those in the second operand. -/// -/// A pair of double-precision values are "ordered" with respect to each -/// other if neither value is a NaN. Each comparison yields 0x0 for false, -/// 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPORDPD / CMPORDPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpord_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpordpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are unordered with respect to those in the second operand. -/// -/// A pair of double-precision values are "unordered" with respect to each -/// other if one or both values are NaN. Each comparison yields 0x0 for -/// false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPUNORDPD / CMPUNORDPD -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpunord_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpunordpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are unequal to those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNEQPD / CMPNEQPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpneq_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpneqpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not less than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTPD / CMPNLTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. 
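The all-ones/all-zeros results of these packed compares are normally consumed as blend masks. A sketch of the classic select pattern, shown only to illustrate the mask idiom (for a plain maximum, _mm_max_pd above is the direct choice).

#include <emmintrin.h>

/* Branchless lane-wise maximum: keep a where a > b, keep b elsewhere. */
static inline __m128d select_max(__m128d a, __m128d b)
{
  __m128d mask = _mm_cmpgt_pd(a, b);          /* all-ones where a > b */
  return _mm_or_pd(_mm_and_pd(mask, a),       /* a in selected lanes  */
                   _mm_andnot_pd(mask, b));   /* b in the others      */
}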
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnlt_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not less than or equal to those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLEPD / CMPNLEPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnle_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__a, (__v2df)__b); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not greater than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTPD / CMPNLTPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpngt_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnltpd((__v2df)__b, (__v2df)__a); -} - -/// Compares each of the corresponding double-precision values of the -/// 128-bit vectors of [2 x double] to determine if the values in the first -/// operand are not greater than or equal to those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLEPD / CMPNLEPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \param __b -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector containing the comparison results. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnge_pd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnlepd((__v2df)__b, (__v2df)__a); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPEQSD / CMPEQSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpeq_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpeqsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTSD / CMPLTSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmplt_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpltsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLESD / CMPLESD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmple_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmplesd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than the corresponding value -/// in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTSD / CMPLTSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpgt_sd(__m128d __a, __m128d __b) -{ - __m128d __c = __builtin_ia32_cmpltsd((__v2df)__b, (__v2df)__a); - return __extension__ (__m128d) { __c[0], __a[1] }; -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLESD / CMPLESD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpge_sd(__m128d __a, __m128d __b) -{ - __m128d __c = __builtin_ia32_cmplesd((__v2df)__b, (__v2df)__a); - return __extension__ (__m128d) { __c[0], __a[1] }; -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is "ordered" with respect to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair -/// of double-precision values are "ordered" with respect to each other if -/// neither value is a NaN. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPORDSD / CMPORDSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpord_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpordsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is "unordered" with respect to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. A pair -/// of double-precision values are "unordered" with respect to each other if -/// one or both values are NaN. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPUNORDSD / CMPUNORDSD -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
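The _sd compares only fill the low lane with a mask, so turning the result into a flag takes one extra step. A minimal sketch using _mm_movemask_pd, which is defined further down this header; the helper name is illustrative.

#include <emmintrin.h>

/* True when the low lane of a is less than the low lane of b. */
static inline int low_lane_lt(__m128d a, __m128d b)
{
  __m128d m = _mm_cmplt_sd(a, b);     /* low lane: all-ones or all-zeros */
  return _mm_movemask_pd(m) & 1;      /* bit 0 mirrors the low-lane sign */
}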
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpunord_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpunordsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNEQSD / CMPNEQSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpneq_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpneqsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is not less than the corresponding -/// value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTSD / CMPNLTSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnlt_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnltsd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is not less than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLESD / CMPNLESD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. 
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnle_sd(__m128d __a, __m128d __b) -{ - return (__m128d)__builtin_ia32_cmpnlesd((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is not greater than the corresponding -/// value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTSD / CMPNLTSD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpngt_sd(__m128d __a, __m128d __b) -{ - __m128d __c = __builtin_ia32_cmpnltsd((__v2df)__b, (__v2df)__a); - return __extension__ (__m128d) { __c[0], __a[1] }; -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is not greater than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0x0 for false, 0xFFFFFFFFFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLESD / CMPNLESD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns A 128-bit vector. The lower 64 bits contains the comparison -/// results. The upper 64 bits are copied from the upper 64 bits of \a __a. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cmpnge_sd(__m128d __a, __m128d __b) -{ - __m128d __c = __builtin_ia32_cmpnlesd((__v2df)__b, (__v2df)__a); - return __extension__ (__m128d) { __c[0], __a[1] }; -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. 
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_comieq_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdeq((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comilt_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdlt((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comile_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdle((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than the corresponding value -/// in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. 
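Unlike the mask-producing compares, the comi* family returns a plain int, so it slots straight into scalar control flow. A short sketch, assuming _mm_set_sd from elsewhere in this header; the documented NaN behaviour carries over, so a NaN input makes this return false.

#include <emmintrin.h>
#include <stdbool.h>

static inline bool scalar_lt(double x, double y)
{
  return _mm_comilt_sd(_mm_set_sd(x), _mm_set_sd(y)) != 0;
}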
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_comigt_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdgt((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comige_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdge((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 1 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISD / COMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 1 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comineq_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_comisdneq((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] for equality. The -/// comparison yields 0 for false, 1 for true. -/// -/// If either of the two lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. 
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomieq_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdeq((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomilt_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdlt((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is less than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomile_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdle((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than the corresponding value -/// in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. 
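The ucomi* variants perform unordered (quiet) compares, which makes them the natural pick when NaNs are expected; a self-comparison then doubles as a NaN test. A tiny sketch with an illustrative helper name.

#include <emmintrin.h>

static inline int is_nan_sd(double x)
{
  __m128d v = _mm_set_sd(x);
  return !_mm_ucomieq_sd(v, v);   /* NaN is unequal to itself, so ucomieq yields 0 */
}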
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomigt_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdgt((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is greater than or equal to the -/// corresponding value in the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two -/// lower double-precision values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison results. If either of the two -/// lower double-precision values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomige_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdge((__v2df)__a, (__v2df)__b); -} - -/// Compares the lower double-precision floating-point values in each of -/// the two 128-bit floating-point vectors of [2 x double] to determine if -/// the value in the first parameter is unequal to the corresponding value in -/// the second parameter. -/// -/// The comparison yields 0 for false, 1 for true. If either of the two lower -/// double-precision values is NaN, 1 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISD / UCOMISD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __b. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision value is -/// compared to the lower double-precision value of \a __a. -/// \returns An integer containing the comparison result. If either of the two -/// lower double-precision values is NaN, 1 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomineq_sd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_ucomisdneq((__v2df)__a, (__v2df)__b); -} - -/// Converts the two double-precision floating-point elements of a -/// 128-bit vector of [2 x double] into two single-precision floating-point -/// values, returned in the lower 64 bits of a 128-bit vector of [4 x float]. -/// The upper 64 bits of the result vector are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPD2PS / CVTPD2PS instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the -/// converted values. The upper 64 bits are set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtpd_ps(__m128d __a) -{ - return __builtin_ia32_cvtpd2ps((__v2df)__a); -} - -/// Converts the lower two single-precision floating-point elements of a -/// 128-bit vector of [4 x float] into two double-precision floating-point -/// values, returned in a 128-bit vector of [2 x double]. The upper two -/// elements of the input vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPS2PD / CVTPS2PD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. 
The lower two single-precision -/// floating-point elements are converted to double-precision values. The -/// upper two elements are unused. -/// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cvtps_pd(__m128 __a) -{ - return (__m128d) __builtin_convertvector( - __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 1), __v2df); -} - -/// Converts the lower two integer elements of a 128-bit vector of -/// [4 x i32] into two double-precision floating-point values, returned in a -/// 128-bit vector of [2 x double]. -/// -/// The upper two elements of the input vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTDQ2PD / CVTDQ2PD instruction. -/// -/// \param __a -/// A 128-bit integer vector of [4 x i32]. The lower two integer elements are -/// converted to double-precision values. -/// -/// The upper two elements are unused. -/// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cvtepi32_pd(__m128i __a) -{ - return (__m128d) __builtin_convertvector( - __builtin_shufflevector((__v4si)__a, (__v4si)__a, 0, 1), __v2df); -} - -/// Converts the two double-precision floating-point elements of a -/// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. The upper -/// 64 bits of the result vector are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPD2DQ / CVTPD2DQ instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the -/// converted values. The upper 64 bits are set to zero. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtpd_epi32(__m128d __a) -{ - return __builtin_ia32_cvtpd2dq((__v2df)__a); -} - -/// Converts the low-order element of a 128-bit vector of [2 x double] -/// into a 32-bit signed integer value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSD2SI / CVTSD2SI instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the -/// conversion. -/// \returns A 32-bit signed integer containing the converted value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvtsd_si32(__m128d __a) -{ - return __builtin_ia32_cvtsd2si((__v2df)__a); -} - -/// Converts the lower double-precision floating-point element of a -/// 128-bit vector of [2 x double], in the second parameter, into a -/// single-precision floating-point value, returned in the lower 32 bits of a -/// 128-bit vector of [4 x float]. The upper 96 bits of the result vector are -/// copied from the upper 96 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSD2SS / CVTSD2SS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The upper 96 bits of this parameter are -/// copied to the upper 96 bits of the result. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower double-precision -/// floating-point element is used in the conversion. -/// \returns A 128-bit vector of [4 x float]. The lower 32 bits contain the -/// converted value from the second parameter. The upper 96 bits are copied -/// from the upper 96 bits of the first parameter. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsd_ss(__m128 __a, __m128d __b) -{ - return (__m128)__builtin_ia32_cvtsd2ss((__v4sf)__a, (__v2df)__b); -} - -/// Converts a 32-bit signed integer value, in the second parameter, into -/// a double-precision floating-point value, returned in the lower 64 bits of -/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector -/// are copied from the upper 64 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSI2SD / CVTSI2SD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are -/// copied to the upper 64 bits of the result. -/// \param __b -/// A 32-bit signed integer containing the value to be converted. -/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the -/// converted value from the second parameter. The upper 64 bits are copied -/// from the upper 64 bits of the first parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cvtsi32_sd(__m128d __a, int __b) -{ - __a[0] = __b; - return __a; -} - -/// Converts the lower single-precision floating-point element of a -/// 128-bit vector of [4 x float], in the second parameter, into a -/// double-precision floating-point value, returned in the lower 64 bits of -/// a 128-bit vector of [2 x double]. The upper 64 bits of the result vector -/// are copied from the upper 64 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSS2SD / CVTSS2SD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The upper 64 bits of this parameter are -/// copied to the upper 64 bits of the result. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower single-precision -/// floating-point element is used in the conversion. -/// \returns A 128-bit vector of [2 x double]. The lower 64 bits contain the -/// converted value from the second parameter. The upper 64 bits are copied -/// from the upper 64 bits of the first parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cvtss_sd(__m128d __a, __m128 __b) -{ - __a[0] = __b[0]; - return __a; -} - -/// Converts the two double-precision floating-point elements of a -/// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in the lower 64 bits of a 128-bit vector of [4 x i32]. -/// -/// If the result of either conversion is inexact, the result is truncated -/// (rounded towards zero) regardless of the current MXCSR setting. The upper -/// 64 bits of the result vector are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTPD2DQ / CVTTPD2DQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector of [4 x i32] whose lower 64 bits contain the -/// converted values. The upper 64 bits are set to zero. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvttpd_epi32(__m128d __a) -{ - return (__m128i)__builtin_ia32_cvttpd2dq((__v2df)__a); -} - -/// Converts the low-order element of a [2 x double] vector into a 32-bit -/// signed integer value, truncating the result when it is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTSD2SI / CVTTSD2SI -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the -/// conversion. -/// \returns A 32-bit signed integer containing the converted value. 
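/*
 * Illustrative sketch contrasting the rounding and truncating scalar
 * conversions described above; not part of the header itself. Assumes the
 * default MXCSR rounding mode (round to nearest); names are examples only.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d v = _mm_set_sd(2.75);
    printf("rounded:   %d\n", _mm_cvtsd_si32(v));   /* 3 under round-to-nearest */
    printf("truncated: %d\n", _mm_cvttsd_si32(v));  /* always 2: truncates toward zero */

    /* _mm_cvtsi32_sd replaces only the low lane; the high lane of v is kept. */
    __m128d w = _mm_cvtsi32_sd(v, 7);
    printf("low lane:  %f\n", _mm_cvtsd_f64(w));    /* 7.000000 */
    return 0;
}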
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvttsd_si32(__m128d __a) -{ - return __builtin_ia32_cvttsd2si((__v2df)__a); -} - -/// Converts the two double-precision floating-point elements of a -/// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in a 64-bit vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPD2PI instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpd_pi32(__m128d __a) -{ - return (__m64)__builtin_ia32_cvtpd2pi((__v2df)__a); -} - -/// Converts the two double-precision floating-point elements of a -/// 128-bit vector of [2 x double] into two signed 32-bit integer values, -/// returned in a 64-bit vector of [2 x i32]. -/// -/// If the result of either conversion is inexact, the result is truncated -/// (rounded towards zero) regardless of the current MXCSR setting. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTTPD2PI instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. -/// \returns A 64-bit vector of [2 x i32] containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvttpd_pi32(__m128d __a) -{ - return (__m64)__builtin_ia32_cvttpd2pi((__v2df)__a); -} - -/// Converts the two signed 32-bit integer elements of a 64-bit vector of -/// [2 x i32] into two double-precision floating-point values, returned in a -/// 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PD instruction. -/// -/// \param __a -/// A 64-bit vector of [2 x i32]. -/// \returns A 128-bit vector of [2 x double] containing the converted values. -static __inline__ __m128d __DEFAULT_FN_ATTRS_MMX -_mm_cvtpi32_pd(__m64 __a) -{ - return __builtin_ia32_cvtpi2pd((__v2si)__a); -} - -/// Returns the low-order element of a 128-bit vector of [2 x double] as -/// a double-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower 64 bits are returned. -/// \returns A double-precision floating-point value copied from the lower 64 -/// bits of \a __a. -static __inline__ double __DEFAULT_FN_ATTRS -_mm_cvtsd_f64(__m128d __a) -{ - return __a[0]; -} - -/// Loads a 128-bit floating-point vector of [2 x double] from an aligned -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPD / MOVAPD instruction. -/// -/// \param __dp -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 16-byte aligned. -/// \returns A 128-bit vector of [2 x double] containing the loaded values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_load_pd(double const *__dp) -{ - return *(const __m128d*)__dp; -} - -/// Loads a double-precision floating-point value from a specified memory -/// location and duplicates it to both vector elements of a 128-bit vector of -/// [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP / MOVDDUP instruction. -/// -/// \param __dp -/// A pointer to a memory location containing a double-precision value. -/// \returns A 128-bit vector of [2 x double] containing the loaded and -/// duplicated values. 
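/*
 * A small sketch of the aligned load described above (not part of the
 * header). The _Alignas qualifier and buffer name are illustrative; the
 * real requirement is only that _mm_load_pd sees a 16-byte-aligned address.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    _Alignas(16) double buf[2] = { 1.0, 2.0 };
    __m128d v = _mm_load_pd(buf);              /* aligned load of both elements */
    printf("low element: %f\n", _mm_cvtsd_f64(v));
    return 0;
}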
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_load1_pd(double const *__dp) -{ - struct __mm_load1_pd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - double __u = ((const struct __mm_load1_pd_struct*)__dp)->__u; - return __extension__ (__m128d){ __u, __u }; -} - -#define _mm_load_pd1(dp) _mm_load1_pd(dp) - -/// Loads two double-precision values, in reverse order, from an aligned -/// memory location into a 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPD / MOVAPD instruction + -/// needed shuffling instructions. In AVX mode, the shuffling may be combined -/// with the \c VMOVAPD, resulting in only a \c VPERMILPD instruction. -/// -/// \param __dp -/// A 16-byte aligned pointer to an array of double-precision values to be -/// loaded in reverse order. -/// \returns A 128-bit vector of [2 x double] containing the reversed loaded -/// values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_loadr_pd(double const *__dp) -{ - __m128d __u = *(const __m128d*)__dp; - return __builtin_shufflevector((__v2df)__u, (__v2df)__u, 1, 0); -} - -/// Loads a 128-bit floating-point vector of [2 x double] from an -/// unaligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPD / MOVUPD instruction. -/// -/// \param __dp -/// A pointer to a 128-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \returns A 128-bit vector of [2 x double] containing the loaded values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_loadu_pd(double const *__dp) -{ - struct __loadu_pd { - __m128d_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_pd*)__dp)->__v; -} - -/// Loads a 64-bit integer value to the low element of a 128-bit integer -/// vector and clears the upper element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __a -/// A pointer to a 64-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \returns A 128-bit vector of [2 x i64] containing the loaded value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si64(void const *__a) -{ - struct __loadu_si64 { - long long __v; - } __attribute__((__packed__, __may_alias__)); - long long __u = ((const struct __loadu_si64*)__a)->__v; - return __extension__ (__m128i)(__v2di){__u, 0LL}; -} - -/// Loads a 32-bit integer value to the low element of a 128-bit integer -/// vector and clears the upper element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVD / MOVD instruction. -/// -/// \param __a -/// A pointer to a 32-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \returns A 128-bit vector of [4 x i32] containing the loaded value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si32(void const *__a) -{ - struct __loadu_si32 { - int __v; - } __attribute__((__packed__, __may_alias__)); - int __u = ((const struct __loadu_si32*)__a)->__v; - return __extension__ (__m128i)(__v4si){__u, 0, 0, 0}; -} - -/// Loads a 16-bit integer value to the low element of a 128-bit integer -/// vector and clears the upper element. -/// -/// \headerfile -/// -/// This intrinsic does not correspond to a specific instruction. -/// -/// \param __a -/// A pointer to a 16-bit memory location. The address of the memory -/// location does not have to be aligned. 
-/// \returns A 128-bit vector of [8 x i16] containing the loaded value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si16(void const *__a) -{ - struct __loadu_si16 { - short __v; - } __attribute__((__packed__, __may_alias__)); - short __u = ((const struct __loadu_si16*)__a)->__v; - return __extension__ (__m128i)(__v8hi){__u, 0, 0, 0, 0, 0, 0, 0}; -} - -/// Loads a 64-bit double-precision value to the low element of a -/// 128-bit integer vector and clears the upper element. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSD / MOVSD instruction. -/// -/// \param __dp -/// A pointer to a memory location containing a double-precision value. -/// The address of the memory location does not have to be aligned. -/// \returns A 128-bit vector of [2 x double] containing the loaded value. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_load_sd(double const *__dp) -{ - struct __mm_load_sd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - double __u = ((const struct __mm_load_sd_struct*)__dp)->__u; - return __extension__ (__m128d){ __u, 0 }; -} - -/// Loads a double-precision value into the high-order bits of a 128-bit -/// vector of [2 x double]. The low-order bits are copied from the low-order -/// bits of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. \n -/// Bits [63:0] are written to bits [63:0] of the result. -/// \param __dp -/// A pointer to a 64-bit memory location containing a double-precision -/// floating-point value that is loaded. The loaded value is written to bits -/// [127:64] of the result. The address of the memory location does not have -/// to be aligned. -/// \returns A 128-bit vector of [2 x double] containing the moved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_loadh_pd(__m128d __a, double const *__dp) -{ - struct __mm_loadh_pd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - double __u = ((const struct __mm_loadh_pd_struct*)__dp)->__u; - return __extension__ (__m128d){ __a[0], __u }; -} - -/// Loads a double-precision value into the low-order bits of a 128-bit -/// vector of [2 x double]. The high-order bits are copied from the -/// high-order bits of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. \n -/// Bits [127:64] are written to bits [127:64] of the result. -/// \param __dp -/// A pointer to a 64-bit memory location containing a double-precision -/// floating-point value that is loaded. The loaded value is written to bits -/// [63:0] of the result. The address of the memory location does not have to -/// be aligned. -/// \returns A 128-bit vector of [2 x double] containing the moved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_loadl_pd(__m128d __a, double const *__dp) -{ - struct __mm_loadl_pd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - double __u = ((const struct __mm_loadl_pd_struct*)__dp)->__u; - return __extension__ (__m128d){ __u, __a[1] }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double] with -/// unspecified content. This could be used as an argument to another -/// intrinsic function where the argument is required but the value is not -/// actually used. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. 
-/// -/// \returns A 128-bit floating-point vector of [2 x double] with unspecified -/// content. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_undefined_pd(void) -{ - return (__m128d)__builtin_ia32_undef128(); -} - -/// Constructs a 128-bit floating-point vector of [2 x double]. The lower -/// 64 bits of the vector are initialized with the specified double-precision -/// floating-point value. The upper 64 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize the lower 64 -/// bits of the result. -/// \returns An initialized 128-bit floating-point vector of [2 x double]. The -/// lower 64 bits contain the value of the parameter. The upper 64 bits are -/// set to zero. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_set_sd(double __w) -{ - return __extension__ (__m128d){ __w, 0 }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double], with each -/// of the two double-precision floating-point vector elements set to the -/// specified double-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP / MOVLHPS instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_set1_pd(double __w) -{ - return __extension__ (__m128d){ __w, __w }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double], with each -/// of the two double-precision floating-point vector elements set to the -/// specified double-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP / MOVLHPS instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_set_pd1(double __w) -{ - return _mm_set1_pd(__w); -} - -/// Constructs a 128-bit floating-point vector of [2 x double] -/// initialized with the specified double-precision floating-point values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize the upper 64 -/// bits of the result. -/// \param __x -/// A double-precision floating-point value used to initialize the lower 64 -/// bits of the result. -/// \returns An initialized 128-bit floating-point vector of [2 x double]. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_set_pd(double __w, double __x) -{ - return __extension__ (__m128d){ __x, __w }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double], -/// initialized in reverse order with the specified double-precision -/// floating-point values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. -/// -/// \param __w -/// A double-precision floating-point value used to initialize the lower 64 -/// bits of the result. -/// \param __x -/// A double-precision floating-point value used to initialize the upper 64 -/// bits of the result. -/// \returns An initialized 128-bit floating-point vector of [2 x double]. 
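/*
 * Sketch of the unaligned load and of the element ordering of _mm_set_pd
 * versus _mm_setr_pd documented above; not part of the header. Buffer
 * contents and names are examples only.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    double buf[2] = { 3.0, 4.0 };          /* no alignment requirement for loadu */
    __m128d u = _mm_loadu_pd(buf);         /* element 0 = 3.0, element 1 = 4.0 */

    __m128d a = _mm_set_pd(4.0, 3.0);      /* high element listed first */
    __m128d b = _mm_setr_pd(3.0, 4.0);     /* low element listed first; same layout */

    printf("%f %f %f\n", _mm_cvtsd_f64(u), _mm_cvtsd_f64(a), _mm_cvtsd_f64(b));
    return 0;
}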
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_setr_pd(double __w, double __x) -{ - return __extension__ (__m128d){ __w, __x }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double] -/// initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS / XORPS instruction. -/// -/// \returns An initialized 128-bit floating-point vector of [2 x double] with -/// all elements set to zero. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_setzero_pd(void) -{ - return __extension__ (__m128d){ 0, 0 }; -} - -/// Constructs a 128-bit floating-point vector of [2 x double]. The lower -/// 64 bits are set to the lower 64 bits of the second parameter. The upper -/// 64 bits are set to the upper 64 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDPD / BLENDPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The upper 64 bits are written to the -/// upper 64 bits of the result. -/// \param __b -/// A 128-bit vector of [2 x double]. The lower 64 bits are written to the -/// lower 64 bits of the result. -/// \returns A 128-bit vector of [2 x double] containing the moved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_move_sd(__m128d __a, __m128d __b) -{ - __a[0] = __b[0]; - return __a; -} - -/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSD / MOVSD instruction. -/// -/// \param __dp -/// A pointer to a 64-bit memory location. -/// \param __a -/// A 128-bit vector of [2 x double] containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_sd(double *__dp, __m128d __a) -{ - struct __mm_store_sd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_store_sd_struct*)__dp)->__u = __a[0]; -} - -/// Moves packed double-precision values from a 128-bit vector of -/// [2 x double] to a memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPD / MOVAPS instruction. -/// -/// \param __dp -/// A pointer to an aligned memory location that can store two -/// double-precision values. -/// \param __a -/// A packed 128-bit vector of [2 x double] containing the values to be -/// moved. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_pd(double *__dp, __m128d __a) -{ - *(__m128d*)__dp = __a; -} - -/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to -/// the upper and lower 64 bits of a memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the -/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. -/// -/// \param __dp -/// A pointer to a memory location that can store two double-precision -/// values. -/// \param __a -/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each -/// of the values in \a __dp. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store1_pd(double *__dp, __m128d __a) -{ - __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); - _mm_store_pd(__dp, __a); -} - -/// Moves the lower 64 bits of a 128-bit vector of [2 x double] twice to -/// the upper and lower 64 bits of a memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the -/// VMOVDDUP + VMOVAPD / MOVLHPS + MOVAPS instruction. -/// -/// \param __dp -/// A pointer to a memory location that can store two double-precision -/// values. 
-/// \param __a -/// A 128-bit vector of [2 x double] whose lower 64 bits are copied to each -/// of the values in \a __dp. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_pd1(double *__dp, __m128d __a) -{ - _mm_store1_pd(__dp, __a); -} - -/// Stores a 128-bit vector of [2 x double] into an unaligned memory -/// location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPD / MOVUPD instruction. -/// -/// \param __dp -/// A pointer to a 128-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \param __a -/// A 128-bit vector of [2 x double] containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_pd(double *__dp, __m128d __a) -{ - struct __storeu_pd { - __m128d_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_pd*)__dp)->__v = __a; -} - -/// Stores two double-precision values, in reverse order, from a 128-bit -/// vector of [2 x double] to a 16-byte aligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to a shuffling instruction followed by a -/// VMOVAPD / MOVAPD instruction. -/// -/// \param __dp -/// A pointer to a 16-byte aligned memory location that can store two -/// double-precision values. -/// \param __a -/// A 128-bit vector of [2 x double] containing the values to be reversed and -/// stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storer_pd(double *__dp, __m128d __a) -{ - __a = __builtin_shufflevector((__v2df)__a, (__v2df)__a, 1, 0); - *(__m128d *)__dp = __a; -} - -/// Stores the upper 64 bits of a 128-bit vector of [2 x double] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. -/// -/// \param __dp -/// A pointer to a 64-bit memory location. -/// \param __a -/// A 128-bit vector of [2 x double] containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeh_pd(double *__dp, __m128d __a) -{ - struct __mm_storeh_pd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[1]; -} - -/// Stores the lower 64 bits of a 128-bit vector of [2 x double] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. -/// -/// \param __dp -/// A pointer to a 64-bit memory location. -/// \param __a -/// A 128-bit vector of [2 x double] containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storel_pd(double *__dp, __m128d __a) -{ - struct __mm_storeh_pd_struct { - double __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_storeh_pd_struct*)__dp)->__u = __a[0]; -} - -/// Adds the corresponding elements of two 128-bit vectors of [16 x i8], -/// saving the lower 8 bits of each sum in the corresponding element of a -/// 128-bit result vector of [16 x i8]. -/// -/// The integer elements of both parameters can be either signed or unsigned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDB / PADDB instruction. -/// -/// \param __a -/// A 128-bit vector of [16 x i8]. -/// \param __b -/// A 128-bit vector of [16 x i8]. -/// \returns A 128-bit vector of [16 x i8] containing the sums of both -/// parameters. 
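/*
 * Illustrative store sketch (not part of the header): writing a vector of
 * [2 x double] back to memory with the aligned and half-width stores
 * described above. Assumes SSE2; variable names are examples only.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128d v = _mm_set_pd(2.0, 1.0);      /* element 0 = 1.0, element 1 = 2.0 */
    _Alignas(16) double out[2];
    double lo, hi;

    _mm_store_pd(out, v);                  /* requires 16-byte alignment */
    _mm_storel_pd(&lo, v);                 /* lo = 1.0 */
    _mm_storeh_pd(&hi, v);                 /* hi = 2.0 */
    printf("%f %f | %f %f\n", out[0], out[1], lo, hi);
    return 0;
}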
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_add_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)((__v16qu)__a + (__v16qu)__b); -} - -/// Adds the corresponding elements of two 128-bit vectors of [8 x i16], -/// saving the lower 16 bits of each sum in the corresponding element of a -/// 128-bit result vector of [8 x i16]. -/// -/// The integer elements of both parameters can be either signed or unsigned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDW / PADDW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16]. -/// \param __b -/// A 128-bit vector of [8 x i16]. -/// \returns A 128-bit vector of [8 x i16] containing the sums of both -/// parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_add_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)((__v8hu)__a + (__v8hu)__b); -} - -/// Adds the corresponding elements of two 128-bit vectors of [4 x i32], -/// saving the lower 32 bits of each sum in the corresponding element of a -/// 128-bit result vector of [4 x i32]. -/// -/// The integer elements of both parameters can be either signed or unsigned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDD / PADDD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32]. -/// \param __b -/// A 128-bit vector of [4 x i32]. -/// \returns A 128-bit vector of [4 x i32] containing the sums of both -/// parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_add_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4su)__a + (__v4su)__b); -} - -/// Adds two signed or unsigned 64-bit integer values, returning the -/// lower 64 bits of the sum. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDQ instruction. -/// -/// \param __a -/// A 64-bit integer. -/// \param __b -/// A 64-bit integer. -/// \returns A 64-bit integer containing the sum of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_add_si64(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_paddq((__v1di)__a, (__v1di)__b); -} - -/// Adds the corresponding elements of two 128-bit vectors of [2 x i64], -/// saving the lower 64 bits of each sum in the corresponding element of a -/// 128-bit result vector of [2 x i64]. -/// -/// The integer elements of both parameters can be either signed or unsigned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDQ / PADDQ instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x i64]. -/// \param __b -/// A 128-bit vector of [2 x i64]. -/// \returns A 128-bit vector of [2 x i64] containing the sums of both -/// parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_add_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a + (__v2du)__b); -} - -/// Adds, with saturation, the corresponding elements of two 128-bit -/// signed [16 x i8] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [16 x i8]. Positive sums greater than 0x7F are -/// saturated to 0x7F. Negative sums less than 0x80 are saturated to 0x80. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDSB / PADDSB instruction. -/// -/// \param __a -/// A 128-bit signed [16 x i8] vector. -/// \param __b -/// A 128-bit signed [16 x i8] vector. -/// \returns A 128-bit signed [16 x i8] vector containing the saturated sums of -/// both parameters. 
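/*
 * A short sketch showing the wrap-around behaviour of the plain packed
 * additions documented above (not part of the header). The lane values are
 * chosen only to make the overflow visible.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi16(30000);
    __m128i b = _mm_set1_epi16(10000);
    __m128i sum = _mm_add_epi16(a, b);     /* 40000 wraps to -25536 per signed lane */

    short out[8];
    _mm_storeu_si128((__m128i *)out, sum);
    printf("%d\n", out[0]);                /* prints -25536 */
    return 0;
}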
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_adds_epi8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_add_sat((__v16qs)__a, (__v16qs)__b); -#else - return (__m128i)__builtin_ia32_paddsb128((__v16qi)__a, (__v16qi)__b); -#endif -} - -/// Adds, with saturation, the corresponding elements of two 128-bit -/// signed [8 x i16] vectors, saving each sum in the corresponding element of -/// a 128-bit result vector of [8 x i16]. Positive sums greater than 0x7FFF -/// are saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDSW / PADDSW instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [8 x i16] vector containing the saturated sums of -/// both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_adds_epi16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_add_sat((__v8hi)__a, (__v8hi)__b); -#else - return (__m128i)__builtin_ia32_paddsw128((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Adds, with saturation, the corresponding elements of two 128-bit -/// unsigned [16 x i8] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [16 x i8]. Positive sums greater than 0xFF -/// are saturated to 0xFF. Negative sums are saturated to 0x00. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDUSB / PADDUSB instruction. -/// -/// \param __a -/// A 128-bit unsigned [16 x i8] vector. -/// \param __b -/// A 128-bit unsigned [16 x i8] vector. -/// \returns A 128-bit unsigned [16 x i8] vector containing the saturated sums -/// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_adds_epu8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_add_sat((__v16qu)__a, (__v16qu)__b); -#else - return (__m128i)__builtin_ia32_paddusb128((__v16qi)__a, (__v16qi)__b); -#endif -} - -/// Adds, with saturation, the corresponding elements of two 128-bit -/// unsigned [8 x i16] vectors, saving each sum in the corresponding element -/// of a 128-bit result vector of [8 x i16]. Positive sums greater than -/// 0xFFFF are saturated to 0xFFFF. Negative sums are saturated to 0x0000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPADDUSB / PADDUSB instruction. -/// -/// \param __a -/// A 128-bit unsigned [8 x i16] vector. -/// \param __b -/// A 128-bit unsigned [8 x i16] vector. -/// \returns A 128-bit unsigned [8 x i16] vector containing the saturated sums -/// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_adds_epu16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_add_sat((__v8hu)__a, (__v8hu)__b); -#else - return (__m128i)__builtin_ia32_paddusw128((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Computes the rounded averages of corresponding elements of two -/// 128-bit unsigned [16 x i8] vectors, saving each result in the -/// corresponding element of a 128-bit result vector of [16 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPAVGB / PAVGB instruction. -/// -/// \param __a -/// A 128-bit unsigned [16 x i8] vector. -/// \param __b -/// A 128-bit unsigned [16 x i8] vector. -/// \returns A 128-bit unsigned [16 x i8] vector containing the rounded -/// averages of both parameters. 
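/*
 * Companion sketch to the saturating additions above (not part of the
 * header): the same operands as the wrap-around example, but clamped at the
 * signed 16-bit maximum instead of wrapping.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi16(30000);
    __m128i b = _mm_set1_epi16(10000);
    __m128i sat = _mm_adds_epi16(a, b);    /* saturates at 0x7FFF = 32767 */

    short out[8];
    _mm_storeu_si128((__m128i *)out, sat);
    printf("%d\n", out[0]);                /* prints 32767 */
    return 0;
}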
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_avg_epu8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pavgb128((__v16qi)__a, (__v16qi)__b); -} - -/// Computes the rounded averages of corresponding elements of two -/// 128-bit unsigned [8 x i16] vectors, saving each result in the -/// corresponding element of a 128-bit result vector of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPAVGW / PAVGW instruction. -/// -/// \param __a -/// A 128-bit unsigned [8 x i16] vector. -/// \param __b -/// A 128-bit unsigned [8 x i16] vector. -/// \returns A 128-bit unsigned [8 x i16] vector containing the rounded -/// averages of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_avg_epu16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pavgw128((__v8hi)__a, (__v8hi)__b); -} - -/// Multiplies the corresponding elements of two 128-bit signed [8 x i16] -/// vectors, producing eight intermediate 32-bit signed integer products, and -/// adds the consecutive pairs of 32-bit products to form a 128-bit signed -/// [4 x i32] vector. -/// -/// For example, bits [15:0] of both parameters are multiplied producing a -/// 32-bit product, bits [31:16] of both parameters are multiplied producing -/// a 32-bit product, and the sum of those two products becomes bits [31:0] -/// of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMADDWD / PMADDWD instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [4 x i32] vector containing the sums of products -/// of both parameters. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_madd_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmaddwd128((__v8hi)__a, (__v8hi)__b); -} - -/// Compares corresponding elements of two 128-bit signed [8 x i16] -/// vectors, saving the greater value from each comparison in the -/// corresponding element of a 128-bit result vector of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXSW / PMAXSW instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [8 x i16] vector containing the greater value of -/// each comparison. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epi16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pmaxsw128((__v8hi)__a, (__v8hi)__b); -#else - return (__m128i)__builtin_elementwise_max((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Compares corresponding elements of two 128-bit unsigned [16 x i8] -/// vectors, saving the greater value from each comparison in the -/// corresponding element of a 128-bit result vector of [16 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXUB / PMAXUB instruction. -/// -/// \param __a -/// A 128-bit unsigned [16 x i8] vector. -/// \param __b -/// A 128-bit unsigned [16 x i8] vector. -/// \returns A 128-bit unsigned [16 x i8] vector containing the greater value of -/// each comparison. 
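/*
 * Sketch of the multiply-and-horizontally-add operation documented above
 * (not part of the header). With every 16-bit lane of one operand set to 3
 * and the other to 4, each 32-bit result lane is 3*4 + 3*4 = 24.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i a = _mm_set1_epi16(3);
    __m128i b = _mm_set1_epi16(4);
    __m128i dot = _mm_madd_epi16(a, b);

    int out[4];
    _mm_storeu_si128((__m128i *)out, dot);
    printf("%d\n", out[0]);                /* prints 24 */
    return 0;
}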
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epu8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pmaxub128((__v16qi)__a, (__v16qi)__b); -#else - return (__m128i)__builtin_elementwise_max((__v16qu)__a, (__v16qu)__b); -#endif -} - -/// Compares corresponding elements of two 128-bit signed [8 x i16] -/// vectors, saving the smaller value from each comparison in the -/// corresponding element of a 128-bit result vector of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINSW / PMINSW instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [8 x i16] vector containing the smaller value of -/// each comparison. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epi16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pminsw128((__v8hi)__a, (__v8hi)__b); -#else - return (__m128i)__builtin_elementwise_min((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Compares corresponding elements of two 128-bit unsigned [16 x i8] -/// vectors, saving the smaller value from each comparison in the -/// corresponding element of a 128-bit result vector of [16 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINUB / PMINUB instruction. -/// -/// \param __a -/// A 128-bit unsigned [16 x i8] vector. -/// \param __b -/// A 128-bit unsigned [16 x i8] vector. -/// \returns A 128-bit unsigned [16 x i8] vector containing the smaller value of -/// each comparison. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epu8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pminub128((__v16qi)__a, (__v16qi)__b); -#else - return (__m128i)__builtin_elementwise_min((__v16qu)__a, (__v16qu)__b); -#endif -} - -/// Multiplies the corresponding elements of two signed [8 x i16] -/// vectors, saving the upper 16 bits of each 32-bit product in the -/// corresponding element of a 128-bit signed [8 x i16] result vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULHW / PMULHW instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [8 x i16] vector containing the upper 16 bits of -/// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mulhi_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhw128((__v8hi)__a, (__v8hi)__b); -} - -/// Multiplies the corresponding elements of two unsigned [8 x i16] -/// vectors, saving the upper 16 bits of each 32-bit product in the -/// corresponding element of a 128-bit unsigned [8 x i16] result vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULHUW / PMULHUW instruction. -/// -/// \param __a -/// A 128-bit unsigned [8 x i16] vector. -/// \param __b -/// A 128-bit unsigned [8 x i16] vector. -/// \returns A 128-bit unsigned [8 x i16] vector containing the upper 16 bits -/// of each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mulhi_epu16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhuw128((__v8hi)__a, (__v8hi)__b); -} - -/// Multiplies the corresponding elements of two signed [8 x i16] -/// vectors, saving the lower 16 bits of each 32-bit product in the -/// corresponding element of a 128-bit signed [8 x i16] result vector. 
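/*
 * Illustrative clamp built from the packed minimum/maximum intrinsics
 * described above; not part of the header. The [0, 255] range and the lane
 * value 300 are arbitrary examples.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v  = _mm_set1_epi16(300);
    __m128i lo = _mm_set1_epi16(0);
    __m128i hi = _mm_set1_epi16(255);
    __m128i clamped = _mm_min_epi16(_mm_max_epi16(v, lo), hi);

    short out[8];
    _mm_storeu_si128((__m128i *)out, clamped);
    printf("%d\n", out[0]);                /* prints 255 */
    return 0;
}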
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULLW / PMULLW instruction. -/// -/// \param __a -/// A 128-bit signed [8 x i16] vector. -/// \param __b -/// A 128-bit signed [8 x i16] vector. -/// \returns A 128-bit signed [8 x i16] vector containing the lower 16 bits of -/// each of the eight 32-bit products. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mullo_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)((__v8hu)__a * (__v8hu)__b); -} - -/// Multiplies 32-bit unsigned integer values contained in the lower bits -/// of the two 64-bit integer vectors and returns the 64-bit unsigned -/// product. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMULUDQ instruction. -/// -/// \param __a -/// A 64-bit integer containing one of the source operands. -/// \param __b -/// A 64-bit integer containing one of the source operands. -/// \returns A 64-bit integer vector containing the product of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_mul_su32(__m64 __a, __m64 __b) -{ - return __builtin_ia32_pmuludq((__v2si)__a, (__v2si)__b); -} - -/// Multiplies 32-bit unsigned integer values contained in the lower -/// bits of the corresponding elements of two [2 x i64] vectors, and returns -/// the 64-bit products in the corresponding elements of a [2 x i64] vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULUDQ / PMULUDQ instruction. -/// -/// \param __a -/// A [2 x i64] vector containing one of the source operands. -/// \param __b -/// A [2 x i64] vector containing one of the source operands. -/// \returns A [2 x i64] vector containing the product of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mul_epu32(__m128i __a, __m128i __b) -{ - return __builtin_ia32_pmuludq128((__v4si)__a, (__v4si)__b); -} - -/// Computes the absolute differences of corresponding 8-bit integer -/// values in two 128-bit vectors. Sums the first 8 absolute differences, and -/// separately sums the second 8 absolute differences. Packs these two -/// unsigned 16-bit integer sums into the upper and lower elements of a -/// [2 x i64] vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSADBW / PSADBW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing one of the source operands. -/// \param __b -/// A 128-bit integer vector containing one of the source operands. -/// \returns A [2 x i64] vector containing the sums of the sets of absolute -/// differences between both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sad_epu8(__m128i __a, __m128i __b) -{ - return __builtin_ia32_psadbw128((__v16qi)__a, (__v16qi)__b); -} - -/// Subtracts the corresponding 8-bit integer values in the operands. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBB / PSUBB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sub_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)((__v16qu)__a - (__v16qu)__b); -} - -/// Subtracts the corresponding 16-bit integer values in the operands. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBW / PSUBW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. 
-/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sub_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)((__v8hu)__a - (__v8hu)__b); -} - -/// Subtracts the corresponding 32-bit integer values in the operands. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBD / PSUBD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sub_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4su)__a - (__v4su)__b); -} - -/// Subtracts signed or unsigned 64-bit integer values and writes the -/// difference to the corresponding bits in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBQ instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the minuend. -/// \param __b -/// A 64-bit integer vector containing the subtrahend. -/// \returns A 64-bit integer vector containing the difference of the values in -/// the operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_sub_si64(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_psubq((__v1di)__a, (__v1di)__b); -} - -/// Subtracts the corresponding elements of two [2 x i64] vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBQ / PSUBQ instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sub_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a - (__v2du)__b); -} - -/// Subtracts corresponding 8-bit signed integer values in the input and -/// returns the differences in the corresponding bytes in the destination. -/// Differences greater than 0x7F are saturated to 0x7F, and differences less -/// than 0x80 are saturated to 0x80. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBSB / PSUBSB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_subs_epi8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_sub_sat((__v16qs)__a, (__v16qs)__b); -#else - return (__m128i)__builtin_ia32_psubsb128((__v16qi)__a, (__v16qi)__b); -#endif -} - -/// Subtracts corresponding 16-bit signed integer values in the input and -/// returns the differences in the corresponding bytes in the destination. -/// Differences greater than 0x7FFF are saturated to 0x7FFF, and values less -/// than 0x8000 are saturated to 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBSW / PSUBSW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. 
-/// \returns A 128-bit integer vector containing the differences of the values -/// in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_subs_epi16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_sub_sat((__v8hi)__a, (__v8hi)__b); -#else - return (__m128i)__builtin_ia32_psubsw128((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Subtracts corresponding 8-bit unsigned integer values in the input -/// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 0x00 are saturated to 0x00. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBUSB / PSUBUSB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the unsigned integer -/// differences of the values in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_subs_epu8(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_sub_sat((__v16qu)__a, (__v16qu)__b); -#else - return (__m128i)__builtin_ia32_psubusb128((__v16qi)__a, (__v16qi)__b); -#endif -} - -/// Subtracts corresponding 16-bit unsigned integer values in the input -/// and returns the differences in the corresponding bytes in the -/// destination. Differences less than 0x0000 are saturated to 0x0000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSUBUSW / PSUBUSW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the minuends. -/// \param __b -/// A 128-bit integer vector containing the subtrahends. -/// \returns A 128-bit integer vector containing the unsigned integer -/// differences of the values in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_subs_epu16(__m128i __a, __m128i __b) -{ -#if (__clang_major__ > 14) - return (__m128i)__builtin_elementwise_sub_sat((__v8hu)__a, (__v8hu)__b); -#else - return (__m128i)__builtin_ia32_psubusw128((__v8hi)__a, (__v8hi)__b); -#endif -} - -/// Performs a bitwise AND of two 128-bit integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPAND / PAND instruction. -/// -/// \param __a -/// A 128-bit integer vector containing one of the source operands. -/// \param __b -/// A 128-bit integer vector containing one of the source operands. -/// \returns A 128-bit integer vector containing the bitwise AND of the values -/// in both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_and_si128(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a & (__v2du)__b); -} - -/// Performs a bitwise AND of two 128-bit integer vectors, using the -/// one's complement of the values contained in the first source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPANDN / PANDN instruction. -/// -/// \param __a -/// A 128-bit vector containing the left source operand. The one's complement -/// of this value is used in the bitwise AND. -/// \param __b -/// A 128-bit vector containing the right source operand. -/// \returns A 128-bit integer vector containing the bitwise AND of the one's -/// complement of the first operand and the values in the second operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_andnot_si128(__m128i __a, __m128i __b) -{ - return (__m128i)(~(__v2du)__a & (__v2du)__b); -} -/// Performs a bitwise OR of two 128-bit integer vectors. 
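/*
 * A small masking sketch using the bitwise intrinsics documented above (not
 * part of the header). The mask splits each 32-bit lane into its low and
 * high halves and then reassembles it; the constants are examples only.
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v    = _mm_set1_epi32(0x12345678);
    __m128i mask = _mm_set1_epi32(0x0000FFFF);
    __m128i low  = _mm_and_si128(v, mask);        /* 0x00005678 per lane */
    __m128i high = _mm_andnot_si128(mask, v);     /* (~mask) & v = 0x12340000 */

    int out[4];
    _mm_storeu_si128((__m128i *)out, _mm_or_si128(low, high));
    printf("0x%08x\n", out[0]);                   /* prints 0x12345678 */
    return 0;
}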
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPOR / POR instruction. -/// -/// \param __a -/// A 128-bit integer vector containing one of the source operands. -/// \param __b -/// A 128-bit integer vector containing one of the source operands. -/// \returns A 128-bit integer vector containing the bitwise OR of the values -/// in both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_or_si128(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a | (__v2du)__b); -} - -/// Performs a bitwise exclusive OR of two 128-bit integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPXOR / PXOR instruction. -/// -/// \param __a -/// A 128-bit integer vector containing one of the source operands. -/// \param __b -/// A 128-bit integer vector containing one of the source operands. -/// \returns A 128-bit integer vector containing the bitwise exclusive OR of the -/// values in both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_xor_si128(__m128i __a, __m128i __b) -{ - return (__m128i)((__v2du)__a ^ (__v2du)__b); -} - -/// Left-shifts the 128-bit integer vector operand by the specified -/// number of bytes. Low-order bits are cleared. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_slli_si128(__m128i a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VPSLLDQ / PSLLDQ instruction. -/// -/// \param a -/// A 128-bit integer vector containing the source operand. -/// \param imm -/// An immediate value specifying the number of bytes to left-shift operand -/// \a a. -/// \returns A 128-bit integer vector containing the left-shifted value. -#define _mm_slli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) - -#define _mm_bslli_si128(a, imm) \ - ((__m128i)__builtin_ia32_pslldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) - -/// Left-shifts each 16-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLW / PSLLW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to left-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_slli_epi16(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_psllwi128((__v8hi)__a, __count); -} - -/// Left-shifts each 16-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLW / PSLLW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sll_epi16(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_psllw128((__v8hi)__a, (__v8hi)__count); -} - -/// Left-shifts each 32-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLD / PSLLD instruction. 
-/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to left-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_slli_epi32(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_pslldi128((__v4si)__a, __count); -} - -/// Left-shifts each 32-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLD / PSLLD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sll_epi32(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_pslld128((__v4si)__a, (__v4si)__count); -} - -/// Left-shifts each 64-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLQ / PSLLQ instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to left-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_slli_epi64(__m128i __a, int __count) -{ - return __builtin_ia32_psllqi128((__v2di)__a, __count); -} - -/// Left-shifts each 64-bit value in the 128-bit integer vector operand -/// by the specified number of bits. Low-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSLLQ / PSLLQ instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to left-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the left-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sll_epi64(__m128i __a, __m128i __count) -{ - return __builtin_ia32_psllq128((__v2di)__a, (__v2di)__count); -} - -/// Right-shifts each 16-bit value in the 128-bit integer vector operand -/// by the specified number of bits. High-order bits are filled with the sign -/// bit of the initial value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRAW / PSRAW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to right-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srai_epi16(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_psrawi128((__v8hi)__a, __count); -} - -/// Right-shifts each 16-bit value in the 128-bit integer vector operand -/// by the specified number of bits. High-order bits are filled with the sign -/// bit of the initial value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRAW / PSRAW instruction. 
-/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sra_epi16(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_psraw128((__v8hi)__a, (__v8hi)__count); -} - -/// Right-shifts each 32-bit value in the 128-bit integer vector operand -/// by the specified number of bits. High-order bits are filled with the sign -/// bit of the initial value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRAD / PSRAD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to right-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srai_epi32(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_psradi128((__v4si)__a, __count); -} - -/// Right-shifts each 32-bit value in the 128-bit integer vector operand -/// by the specified number of bits. High-order bits are filled with the sign -/// bit of the initial value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRAD / PSRAD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sra_epi32(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_psrad128((__v4si)__a, (__v4si)__count); -} - -/// Right-shifts the 128-bit integer vector operand by the specified -/// number of bytes. High-order bits are cleared. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_srli_si128(__m128i a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VPSRLDQ / PSRLDQ instruction. -/// -/// \param a -/// A 128-bit integer vector containing the source operand. -/// \param imm -/// An immediate value specifying the number of bytes to right-shift operand -/// \a a. -/// \returns A 128-bit integer vector containing the right-shifted value. -#define _mm_srli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) - -#define _mm_bsrli_si128(a, imm) \ - ((__m128i)__builtin_ia32_psrldqi128_byteshift((__v2di)(__m128i)(a), (int)(imm))) - -/// Right-shifts each of 16-bit values in the 128-bit integer vector -/// operand by the specified number of bits. High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLW / PSRLW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to right-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srli_epi16(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_psrlwi128((__v8hi)__a, __count); -} - -/// Right-shifts each of 16-bit values in the 128-bit integer vector -/// operand by the specified number of bits. 
High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLW / PSRLW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srl_epi16(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_psrlw128((__v8hi)__a, (__v8hi)__count); -} - -/// Right-shifts each of 32-bit values in the 128-bit integer vector -/// operand by the specified number of bits. High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLD / PSRLD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to right-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srli_epi32(__m128i __a, int __count) -{ - return (__m128i)__builtin_ia32_psrldi128((__v4si)__a, __count); -} - -/// Right-shifts each of 32-bit values in the 128-bit integer vector -/// operand by the specified number of bits. High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLD / PSRLD instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srl_epi32(__m128i __a, __m128i __count) -{ - return (__m128i)__builtin_ia32_psrld128((__v4si)__a, (__v4si)__count); -} - -/// Right-shifts each of 64-bit values in the 128-bit integer vector -/// operand by the specified number of bits. High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLQ / PSRLQ instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// An integer value specifying the number of bits to right-shift each value -/// in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srli_epi64(__m128i __a, int __count) -{ - return __builtin_ia32_psrlqi128((__v2di)__a, __count); -} - -/// Right-shifts each of 64-bit values in the 128-bit integer vector -/// operand by the specified number of bits. High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPSRLQ / PSRLQ instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the source operand. -/// \param __count -/// A 128-bit integer vector in which bits [63:0] specify the number of bits -/// to right-shift each value in operand \a __a. -/// \returns A 128-bit integer vector containing the right-shifted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_srl_epi64(__m128i __a, __m128i __count) -{ - return __builtin_ia32_psrlq128((__v2di)__a, (__v2di)__count); -} - -/// Compares each of the corresponding 8-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0x0 for false, 0xFF -/// for true. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPEQB / PCMPEQB instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpeq_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)((__v16qi)__a == (__v16qi)__b); -} - -/// Compares each of the corresponding 16-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0x0 for false, -/// 0xFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPEQW / PCMPEQW instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpeq_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)((__v8hi)__a == (__v8hi)__b); -} - -/// Compares each of the corresponding 32-bit values of the 128-bit -/// integer vectors for equality. Each comparison yields 0x0 for false, -/// 0xFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPEQD / PCMPEQD instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpeq_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4si)__a == (__v4si)__b); -} - -/// Compares each of the corresponding signed 8-bit values of the 128-bit -/// integer vectors to determine if the values in the first operand are -/// greater than those in the second operand. Each comparison yields 0x0 for -/// false, 0xFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTB / PCMPGTB instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpgt_epi8(__m128i __a, __m128i __b) -{ - /* This function always performs a signed comparison, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m128i)((__v16qs)__a > (__v16qs)__b); -} - -/// Compares each of the corresponding signed 16-bit values of the -/// 128-bit integer vectors to determine if the values in the first operand -/// are greater than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTW / PCMPGTW instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpgt_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)((__v8hi)__a > (__v8hi)__b); -} - -/// Compares each of the corresponding signed 32-bit values of the -/// 128-bit integer vectors to determine if the values in the first operand -/// are greater than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTD / PCMPGTD instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. 
-/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpgt_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)((__v4si)__a > (__v4si)__b); -} - -/// Compares each of the corresponding signed 8-bit values of the 128-bit -/// integer vectors to determine if the values in the first operand are less -/// than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTB / PCMPGTB instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmplt_epi8(__m128i __a, __m128i __b) -{ - return _mm_cmpgt_epi8(__b, __a); -} - -/// Compares each of the corresponding signed 16-bit values of the -/// 128-bit integer vectors to determine if the values in the first operand -/// are less than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTW / PCMPGTW instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmplt_epi16(__m128i __a, __m128i __b) -{ - return _mm_cmpgt_epi16(__b, __a); -} - -/// Compares each of the corresponding signed 32-bit values of the -/// 128-bit integer vectors to determine if the values in the first operand -/// are less than those in the second operand. -/// -/// Each comparison yields 0x0 for false, 0xFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTD / PCMPGTD instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __b -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmplt_epi32(__m128i __a, __m128i __b) -{ - return _mm_cmpgt_epi32(__b, __a); -} - -#ifdef __x86_64__ -/// Converts a 64-bit signed integer value from the second operand into a -/// double-precision value and returns it in the lower element of a [2 x -/// double] vector; the upper element of the returned vector is copied from -/// the upper element of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSI2SD / CVTSI2SD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The upper 64 bits of this operand are -/// copied to the upper 64 bits of the destination. -/// \param __b -/// A 64-bit signed integer operand containing the value to be converted. -/// \returns A 128-bit vector of [2 x double] whose lower 64 bits contain the -/// converted value of the second operand. The upper 64 bits are copied from -/// the upper 64 bits of the first operand. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_cvtsi64_sd(__m128d __a, long long __b) -{ - __a[0] = __b; - return __a; -} - -/// Converts the first (lower) element of a vector of [2 x double] into a -/// 64-bit signed integer value, according to the current rounding mode. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSD2SI / CVTSD2SI instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the -/// conversion. 
-/// \returns A 64-bit signed integer containing the converted value. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvtsd_si64(__m128d __a) -{ - return __builtin_ia32_cvtsd2si64((__v2df)__a); -} - -/// Converts the first (lower) element of a vector of [2 x double] into a -/// 64-bit signed integer value, truncating the result when it is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTSD2SI / CVTTSD2SI -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. The lower 64 bits are used in the -/// conversion. -/// \returns A 64-bit signed integer containing the converted value. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvttsd_si64(__m128d __a) -{ - return __builtin_ia32_cvttsd2si64((__v2df)__a); -} -#endif - -/// Converts a vector of [4 x i32] into a vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTDQ2PS / CVTDQ2PS instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 128-bit vector of [4 x float] containing the converted values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtepi32_ps(__m128i __a) -{ - return (__m128)__builtin_convertvector((__v4si)__a, __v4sf); -} - -/// Converts a vector of [4 x float] into a vector of [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPS2DQ / CVTPS2DQ instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit integer vector of [4 x i32] containing the converted -/// values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtps_epi32(__m128 __a) -{ - return (__m128i)__builtin_ia32_cvtps2dq((__v4sf)__a); -} - -/// Converts a vector of [4 x float] into a vector of [4 x i32], -/// truncating the result when it is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTPS2DQ / CVTTPS2DQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x i32] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvttps_epi32(__m128 __a) -{ - return (__m128i)__builtin_ia32_cvttps2dq((__v4sf)__a); -} - -/// Returns a vector of [4 x i32] where the lowest element is the input -/// operand and the remaining elements are zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVD / MOVD instruction. -/// -/// \param __a -/// A 32-bit signed integer operand. -/// \returns A 128-bit vector of [4 x i32]. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtsi32_si128(int __a) -{ - return __extension__ (__m128i)(__v4si){ __a, 0, 0, 0 }; -} - -#ifdef __x86_64__ -/// Returns a vector of [2 x i64] where the lower element is the input -/// operand and the upper element is zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __a -/// A 64-bit signed integer operand containing the value to be converted. -/// \returns A 128-bit vector of [2 x i64] containing the converted value. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtsi64_si128(long long __a) -{ - return __extension__ (__m128i)(__v2di){ __a, 0 }; -} -#endif - -/// Moves the least significant 32 bits of a vector of [4 x i32] to a -/// 32-bit signed integer value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVD / MOVD instruction. -/// -/// \param __a -/// A vector of [4 x i32]. The least significant 32 bits are moved to the -/// destination. 
-/// \returns A 32-bit signed integer containing the moved value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvtsi128_si32(__m128i __a) -{ - __v4si __b = (__v4si)__a; - return __b[0]; -} - -#ifdef __x86_64__ -/// Moves the least significant 64 bits of a vector of [2 x i64] to a -/// 64-bit signed integer value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __a -/// A vector of [2 x i64]. The least significant 64 bits are moved to the -/// destination. -/// \returns A 64-bit signed integer containing the moved value. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvtsi128_si64(__m128i __a) -{ - return __a[0]; -} -#endif - -/// Moves packed integer values from an aligned 128-bit memory location -/// to elements in a 128-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQA / MOVDQA instruction. -/// -/// \param __p -/// An aligned pointer to a memory location containing integer values. -/// \returns A 128-bit integer vector containing the moved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_load_si128(__m128i const *__p) -{ - return *__p; -} - -/// Moves packed integer values from an unaligned 128-bit memory location -/// to elements in a 128-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDQU / MOVDQU instruction. -/// -/// \param __p -/// A pointer to a memory location containing integer values. -/// \returns A 128-bit integer vector containing the moved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadu_si128(__m128i_u const *__p) -{ - struct __loadu_si128 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_si128*)__p)->__v; -} - -/// Returns a vector of [2 x i64] where the lower element is taken from -/// the lower element of the operand, and the upper element is zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __p -/// A 128-bit vector of [2 x i64]. Bits [63:0] are written to bits [63:0] of -/// the destination. -/// \returns A 128-bit vector of [2 x i64]. The lower order bits contain the -/// moved value. The higher order bits are cleared. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_loadl_epi64(__m128i_u const *__p) -{ - struct __mm_loadl_epi64_struct { - long long __u; - } __attribute__((__packed__, __may_alias__)); - return __extension__ (__m128i) { ((const struct __mm_loadl_epi64_struct*)__p)->__u, 0}; -} - -/// Generates a 128-bit vector of [4 x i32] with unspecified content. -/// This could be used as an argument to another intrinsic function where the -/// argument is required but the value is not actually used. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \returns A 128-bit vector of [4 x i32] with unspecified content. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_undefined_si128(void) -{ - return (__m128i)__builtin_ia32_undef128(); -} - -/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with -/// the specified 64-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __q1 -/// A 64-bit integer value used to initialize the upper 64 bits of the -/// destination vector of [2 x i64]. -/// \param __q0 -/// A 64-bit integer value used to initialize the lower 64 bits of the -/// destination vector of [2 x i64]. 
-/// \returns An initialized 128-bit vector of [2 x i64] containing the values -/// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set_epi64x(long long __q1, long long __q0) -{ - return __extension__ (__m128i)(__v2di){ __q0, __q1 }; -} - -/// Initializes both 64-bit values in a 128-bit vector of [2 x i64] with -/// the specified 64-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __q1 -/// A 64-bit integer value used to initialize the upper 64 bits of the -/// destination vector of [2 x i64]. -/// \param __q0 -/// A 64-bit integer value used to initialize the lower 64 bits of the -/// destination vector of [2 x i64]. -/// \returns An initialized 128-bit vector of [2 x i64] containing the values -/// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set_epi64(__m64 __q1, __m64 __q0) -{ - return _mm_set_epi64x((long long)__q1, (long long)__q0); -} - -/// Initializes the 32-bit values in a 128-bit vector of [4 x i32] with -/// the specified 32-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i3 -/// A 32-bit integer value used to initialize bits [127:96] of the -/// destination vector. -/// \param __i2 -/// A 32-bit integer value used to initialize bits [95:64] of the destination -/// vector. -/// \param __i1 -/// A 32-bit integer value used to initialize bits [63:32] of the destination -/// vector. -/// \param __i0 -/// A 32-bit integer value used to initialize bits [31:0] of the destination -/// vector. -/// \returns An initialized 128-bit vector of [4 x i32] containing the values -/// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set_epi32(int __i3, int __i2, int __i1, int __i0) -{ - return __extension__ (__m128i)(__v4si){ __i0, __i1, __i2, __i3}; -} - -/// Initializes the 16-bit values in a 128-bit vector of [8 x i16] with -/// the specified 16-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w7 -/// A 16-bit integer value used to initialize bits [127:112] of the -/// destination vector. -/// \param __w6 -/// A 16-bit integer value used to initialize bits [111:96] of the -/// destination vector. -/// \param __w5 -/// A 16-bit integer value used to initialize bits [95:80] of the destination -/// vector. -/// \param __w4 -/// A 16-bit integer value used to initialize bits [79:64] of the destination -/// vector. -/// \param __w3 -/// A 16-bit integer value used to initialize bits [63:48] of the destination -/// vector. -/// \param __w2 -/// A 16-bit integer value used to initialize bits [47:32] of the destination -/// vector. -/// \param __w1 -/// A 16-bit integer value used to initialize bits [31:16] of the destination -/// vector. -/// \param __w0 -/// A 16-bit integer value used to initialize bits [15:0] of the destination -/// vector. -/// \returns An initialized 128-bit vector of [8 x i16] containing the values -/// provided in the operands. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set_epi16(short __w7, short __w6, short __w5, short __w4, short __w3, short __w2, short __w1, short __w0) -{ - return __extension__ (__m128i)(__v8hi){ __w0, __w1, __w2, __w3, __w4, __w5, __w6, __w7 }; -} - -/// Initializes the 8-bit values in a 128-bit vector of [16 x i8] with -/// the specified 8-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b15 -/// Initializes bits [127:120] of the destination vector. -/// \param __b14 -/// Initializes bits [119:112] of the destination vector. -/// \param __b13 -/// Initializes bits [111:104] of the destination vector. -/// \param __b12 -/// Initializes bits [103:96] of the destination vector. -/// \param __b11 -/// Initializes bits [95:88] of the destination vector. -/// \param __b10 -/// Initializes bits [87:80] of the destination vector. -/// \param __b9 -/// Initializes bits [79:72] of the destination vector. -/// \param __b8 -/// Initializes bits [71:64] of the destination vector. -/// \param __b7 -/// Initializes bits [63:56] of the destination vector. -/// \param __b6 -/// Initializes bits [55:48] of the destination vector. -/// \param __b5 -/// Initializes bits [47:40] of the destination vector. -/// \param __b4 -/// Initializes bits [39:32] of the destination vector. -/// \param __b3 -/// Initializes bits [31:24] of the destination vector. -/// \param __b2 -/// Initializes bits [23:16] of the destination vector. -/// \param __b1 -/// Initializes bits [15:8] of the destination vector. -/// \param __b0 -/// Initializes bits [7:0] of the destination vector. -/// \returns An initialized 128-bit vector of [16 x i8] containing the values -/// provided in the operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set_epi8(char __b15, char __b14, char __b13, char __b12, char __b11, char __b10, char __b9, char __b8, char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, char __b1, char __b0) -{ - return __extension__ (__m128i)(__v16qi){ __b0, __b1, __b2, __b3, __b4, __b5, __b6, __b7, __b8, __b9, __b10, __b11, __b12, __b13, __b14, __b15 }; -} - -/// Initializes both values in a 128-bit integer vector with the -/// specified 64-bit integer value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __q -/// Integer value used to initialize the elements of the destination integer -/// vector. -/// \returns An initialized 128-bit integer vector of [2 x i64] with both -/// elements containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set1_epi64x(long long __q) -{ - return _mm_set_epi64x(__q, __q); -} - -/// Initializes both values in a 128-bit vector of [2 x i64] with the -/// specified 64-bit value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __q -/// A 64-bit value used to initialize the elements of the destination integer -/// vector. -/// \returns An initialized 128-bit vector of [2 x i64] with all elements -/// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set1_epi64(__m64 __q) -{ - return _mm_set_epi64(__q, __q); -} - -/// Initializes all values in a 128-bit vector of [4 x i32] with the -/// specified 32-bit value. 
-/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i -/// A 32-bit value used to initialize the elements of the destination integer -/// vector. -/// \returns An initialized 128-bit vector of [4 x i32] with all elements -/// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set1_epi32(int __i) -{ - return _mm_set_epi32(__i, __i, __i, __i); -} - -/// Initializes all values in a 128-bit vector of [8 x i16] with the -/// specified 16-bit value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w -/// A 16-bit value used to initialize the elements of the destination integer -/// vector. -/// \returns An initialized 128-bit vector of [8 x i16] with all elements -/// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set1_epi16(short __w) -{ - return _mm_set_epi16(__w, __w, __w, __w, __w, __w, __w, __w); -} - -/// Initializes all values in a 128-bit vector of [16 x i8] with the -/// specified 8-bit value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b -/// An 8-bit value used to initialize the elements of the destination integer -/// vector. -/// \returns An initialized 128-bit vector of [16 x i8] with all elements -/// containing the value provided in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_set1_epi8(char __b) -{ - return _mm_set_epi8(__b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b, __b); -} - -/// Constructs a 128-bit integer vector, initialized in reverse order -/// with the specified 64-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic does not correspond to a specific instruction. -/// -/// \param __q0 -/// A 64-bit integral value used to initialize the lower 64 bits of the -/// result. -/// \param __q1 -/// A 64-bit integral value used to initialize the upper 64 bits of the -/// result. -/// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_setr_epi64(__m64 __q0, __m64 __q1) -{ - return _mm_set_epi64(__q1, __q0); -} - -/// Constructs a 128-bit integer vector, initialized in reverse order -/// with the specified 32-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i0 -/// A 32-bit integral value used to initialize bits [31:0] of the result. -/// \param __i1 -/// A 32-bit integral value used to initialize bits [63:32] of the result. -/// \param __i2 -/// A 32-bit integral value used to initialize bits [95:64] of the result. -/// \param __i3 -/// A 32-bit integral value used to initialize bits [127:96] of the result. -/// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_setr_epi32(int __i0, int __i1, int __i2, int __i3) -{ - return _mm_set_epi32(__i3, __i2, __i1, __i0); -} - -/// Constructs a 128-bit integer vector, initialized in reverse order -/// with the specified 16-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w0 -/// A 16-bit integral value used to initialize bits [15:0] of the result. 
-/// \param __w1 -/// A 16-bit integral value used to initialize bits [31:16] of the result. -/// \param __w2 -/// A 16-bit integral value used to initialize bits [47:32] of the result. -/// \param __w3 -/// A 16-bit integral value used to initialize bits [63:48] of the result. -/// \param __w4 -/// A 16-bit integral value used to initialize bits [79:64] of the result. -/// \param __w5 -/// A 16-bit integral value used to initialize bits [95:80] of the result. -/// \param __w6 -/// A 16-bit integral value used to initialize bits [111:96] of the result. -/// \param __w7 -/// A 16-bit integral value used to initialize bits [127:112] of the result. -/// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_setr_epi16(short __w0, short __w1, short __w2, short __w3, short __w4, short __w5, short __w6, short __w7) -{ - return _mm_set_epi16(__w7, __w6, __w5, __w4, __w3, __w2, __w1, __w0); -} - -/// Constructs a 128-bit integer vector, initialized in reverse order -/// with the specified 8-bit integral values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b0 -/// An 8-bit integral value used to initialize bits [7:0] of the result. -/// \param __b1 -/// An 8-bit integral value used to initialize bits [15:8] of the result. -/// \param __b2 -/// An 8-bit integral value used to initialize bits [23:16] of the result. -/// \param __b3 -/// An 8-bit integral value used to initialize bits [31:24] of the result. -/// \param __b4 -/// An 8-bit integral value used to initialize bits [39:32] of the result. -/// \param __b5 -/// An 8-bit integral value used to initialize bits [47:40] of the result. -/// \param __b6 -/// An 8-bit integral value used to initialize bits [55:48] of the result. -/// \param __b7 -/// An 8-bit integral value used to initialize bits [63:56] of the result. -/// \param __b8 -/// An 8-bit integral value used to initialize bits [71:64] of the result. -/// \param __b9 -/// An 8-bit integral value used to initialize bits [79:72] of the result. -/// \param __b10 -/// An 8-bit integral value used to initialize bits [87:80] of the result. -/// \param __b11 -/// An 8-bit integral value used to initialize bits [95:88] of the result. -/// \param __b12 -/// An 8-bit integral value used to initialize bits [103:96] of the result. -/// \param __b13 -/// An 8-bit integral value used to initialize bits [111:104] of the result. -/// \param __b14 -/// An 8-bit integral value used to initialize bits [119:112] of the result. -/// \param __b15 -/// An 8-bit integral value used to initialize bits [127:120] of the result. -/// \returns An initialized 128-bit integer vector. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_setr_epi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, char __b6, char __b7, char __b8, char __b9, char __b10, char __b11, char __b12, char __b13, char __b14, char __b15) -{ - return _mm_set_epi8(__b15, __b14, __b13, __b12, __b11, __b10, __b9, __b8, __b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); -} - -/// Creates a 128-bit integer vector initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS / XORPS instruction. -/// -/// \returns An initialized 128-bit integer vector with all elements set to -/// zero. 
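The _mm_setr_* variants above are simply the _mm_set_* constructors with the argument order reversed, element 0 first. A short sketch, again an editorial illustration rather than part of the header, checking that equivalence with _mm_cmpeq_epi8 and _mm_movemask_epi8 from this same header; the test program itself is hypothetical:

#include <emmintrin.h>
#include <assert.h>

int main(void)
{
  __m128i a = _mm_setr_epi32(0, 1, 2, 3);  /* element 0 given first */
  __m128i b = _mm_set_epi32(3, 2, 1, 0);   /* element 3 given first: same vector */
  /* All sixteen byte comparisons succeed, so the movemask is 0xFFFF. */
  assert(_mm_movemask_epi8(_mm_cmpeq_epi8(a, b)) == 0xFFFF);
  return 0;
}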
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_setzero_si128(void) -{ - return __extension__ (__m128i)(__v2di){ 0LL, 0LL }; -} - -/// Stores a 128-bit integer vector to a memory location aligned on a -/// 128-bit boundary. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. -/// -/// \param __p -/// A pointer to an aligned memory location that will receive the integer -/// values. -/// \param __b -/// A 128-bit integer vector containing the values to be moved. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_si128(__m128i *__p, __m128i __b) -{ - *__p = __b; -} - -/// Stores a 128-bit integer vector to an unaligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the integer values. -/// \param __b -/// A 128-bit integer vector containing the values to be moved. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si128(__m128i_u *__p, __m128i __b) -{ - struct __storeu_si128 { - __m128i_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si128*)__p)->__v = __b; -} - -/// Stores a 64-bit integer value from the low element of a 128-bit integer -/// vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __p -/// A pointer to a 64-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \param __b -/// A 128-bit integer vector containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si64(void *__p, __m128i __b) -{ - struct __storeu_si64 { - long long __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si64*)__p)->__v = ((__v2di)__b)[0]; -} - -/// Stores a 32-bit integer value from the low element of a 128-bit integer -/// vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVD / MOVD instruction. -/// -/// \param __p -/// A pointer to a 32-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \param __b -/// A 128-bit integer vector containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si32(void *__p, __m128i __b) -{ - struct __storeu_si32 { - int __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si32*)__p)->__v = ((__v4si)__b)[0]; -} - -/// Stores a 16-bit integer value from the low element of a 128-bit integer -/// vector. -/// -/// \headerfile -/// -/// This intrinsic does not correspond to a specific instruction. -/// -/// \param __p -/// A pointer to a 16-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \param __b -/// A 128-bit integer vector containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_si16(void *__p, __m128i __b) -{ - struct __storeu_si16 { - short __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_si16*)__p)->__v = ((__v8hi)__b)[0]; -} - -/// Moves bytes selected by the mask from the first operand to the -/// specified unaligned memory location. When a mask bit is 1, the -/// corresponding byte is written, otherwise it is not written. -/// -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). Exception and trap behavior for elements not selected -/// for storage to memory are implementation dependent. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMASKMOVDQU / MASKMOVDQU -/// instruction. -/// -/// \param __d -/// A 128-bit integer vector containing the values to be moved. -/// \param __n -/// A 128-bit integer vector containing the mask. The most significant bit of -/// each byte represents the mask bits. -/// \param __p -/// A pointer to an unaligned 128-bit memory location where the specified -/// values are moved. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_maskmoveu_si128(__m128i __d, __m128i __n, char *__p) -{ - __builtin_ia32_maskmovdqu((__v16qi)__d, (__v16qi)__n, __p); -} - -/// Stores the lower 64 bits of a 128-bit integer vector of [2 x i64] to -/// a memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVLPS / MOVLPS instruction. -/// -/// \param __p -/// A pointer to a 64-bit memory location that will receive the lower 64 bits -/// of the integer vector parameter. -/// \param __a -/// A 128-bit integer vector of [2 x i64]. The lower 64 bits contain the -/// value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storel_epi64(__m128i_u *__p, __m128i __a) -{ - struct __mm_storel_epi64_struct { - long long __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_storel_epi64_struct*)__p)->__u = __a[0]; -} - -/// Stores a 128-bit floating point vector of [2 x double] to a 128-bit -/// aligned memory location. -/// -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTPD / MOVNTPD instruction. -/// -/// \param __p -/// A pointer to the 128-bit aligned memory location used to store the value. -/// \param __a -/// A vector of [2 x double] containing the 64-bit values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_pd(double *__p, __m128d __a) -{ - __builtin_nontemporal_store((__v2df)__a, (__v2df*)__p); -} - -/// Stores a 128-bit integer vector to a 128-bit aligned memory location. -/// -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTDQ / MOVNTDQ instruction. -/// -/// \param __p -/// A pointer to the 128-bit aligned memory location used to store the value. -/// \param __a -/// A 128-bit integer vector containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_si128(__m128i *__p, __m128i __a) -{ - __builtin_nontemporal_store((__v2di)__a, (__v2di*)__p); -} - -/// Stores a 32-bit integer value in the specified memory location. -/// -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVNTI instruction. -/// -/// \param __p -/// A pointer to the 32-bit memory location used to store the value. -/// \param __a -/// A 32-bit integer containing the value to be stored. -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) -_mm_stream_si32(int *__p, int __a) -{ - __builtin_ia32_movnti(__p, __a); -} - -#ifdef __x86_64__ -/// Stores a 64-bit integer value in the specified memory location. -/// -/// To minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVNTIQ instruction. -/// -/// \param __p -/// A pointer to the 64-bit memory location used to store the value. 
-/// \param __a -/// A 64-bit integer containing the value to be stored. -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("sse2"))) -_mm_stream_si64(long long *__p, long long __a) -{ - __builtin_ia32_movnti64(__p, __a); -} -#endif - -#if defined(__cplusplus) -extern "C" { -#endif - -/// The cache line containing \a __p is flushed and invalidated from all -/// caches in the coherency domain. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CLFLUSH instruction. -/// -/// \param __p -/// A pointer to the memory location used to identify the cache line to be -/// flushed. -void _mm_clflush(void const * __p); - -/// Forces strong memory ordering (serialization) between load -/// instructions preceding this instruction and load instructions following -/// this instruction, ensuring the system completes all previous loads before -/// executing subsequent loads. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LFENCE instruction. -/// -void _mm_lfence(void); - -/// Forces strong memory ordering (serialization) between load and store -/// instructions preceding this instruction and load and store instructions -/// following this instruction, ensuring that the system completes all -/// previous memory accesses before executing subsequent memory accesses. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MFENCE instruction. -/// -void _mm_mfence(void); - -#if defined(__cplusplus) -} // extern "C" -#endif - -/// Converts 16-bit signed integers from both 128-bit integer vector -/// operands into 8-bit signed integers, and packs the results into the -/// destination. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPACKSSWB / PACKSSWB instruction. -/// -/// \param __a -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to a 8-bit signed integer with -/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less -/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are -/// written to the lower 64 bits of the result. -/// \param __b -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to a 8-bit signed integer with -/// saturation. Values greater than 0x7F are saturated to 0x7F. Values less -/// than 0x80 are saturated to 0x80. The converted [8 x i8] values are -/// written to the higher 64 bits of the result. -/// \returns A 128-bit vector of [16 x i8] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_packs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_packsswb128((__v8hi)__a, (__v8hi)__b); -} - -/// Converts 32-bit signed integers from both 128-bit integer vector -/// operands into 16-bit signed integers, and packs the results into the -/// destination. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPACKSSDW / PACKSSDW instruction. -/// -/// \param __a -/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as -/// a signed integer and is converted to a 16-bit signed integer with -/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values -/// less than 0x8000 are saturated to 0x8000. 
The converted [4 x i16] values -/// are written to the lower 64 bits of the result. -/// \param __b -/// A 128-bit integer vector of [4 x i32]. Each 32-bit element is treated as -/// a signed integer and is converted to a 16-bit signed integer with -/// saturation. Values greater than 0x7FFF are saturated to 0x7FFF. Values -/// less than 0x8000 are saturated to 0x8000. The converted [4 x i16] values -/// are written to the higher 64 bits of the result. -/// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_packs_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_packssdw128((__v4si)__a, (__v4si)__b); -} - -/// Converts 16-bit signed integers from both 128-bit integer vector -/// operands into 8-bit unsigned integers, and packs the results into the -/// destination. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPACKUSWB / PACKUSWB instruction. -/// -/// \param __a -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are -/// written to the lower 64 bits of the result. -/// \param __b -/// A 128-bit integer vector of [8 x i16]. Each 16-bit element is treated as -/// a signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0x00 are saturated to 0x00. The converted [8 x i8] values are -/// written to the higher 64 bits of the result. -/// \returns A 128-bit vector of [16 x i8] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_packus_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_packuswb128((__v8hi)__a, (__v8hi)__b); -} - -/// Extracts 16 bits from a 128-bit integer vector of [8 x i16], using -/// the immediate-value parameter as a selector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPEXTRW / PEXTRW instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \param __imm -/// An immediate value. Bits [2:0] selects values from \a __a to be assigned -/// to bits[15:0] of the result. \n -/// 000: assign values from bits [15:0] of \a __a. \n -/// 001: assign values from bits [31:16] of \a __a. \n -/// 010: assign values from bits [47:32] of \a __a. \n -/// 011: assign values from bits [63:48] of \a __a. \n -/// 100: assign values from bits [79:64] of \a __a. \n -/// 101: assign values from bits [95:80] of \a __a. \n -/// 110: assign values from bits [111:96] of \a __a. \n -/// 111: assign values from bits [127:112] of \a __a. -/// \returns An integer, whose lower 16 bits are selected from the 128-bit -/// integer vector parameter and the remaining bits are assigned zeros. -#define _mm_extract_epi16(a, imm) \ - ((int)(unsigned short)__builtin_ia32_vec_ext_v8hi((__v8hi)(__m128i)(a), \ - (int)(imm))) - -/// Constructs a 128-bit integer vector by first making a copy of the -/// 128-bit integer vector parameter, and then inserting the lower 16 bits -/// of an integer parameter into an offset specified by the immediate-value -/// parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPINSRW / PINSRW instruction. 
-/// -/// \param __a -/// A 128-bit integer vector of [8 x i16]. This vector is copied to the -/// result and then one of the eight elements in the result is replaced by -/// the lower 16 bits of \a __b. -/// \param __b -/// An integer. The lower 16 bits of this parameter are written to the -/// result beginning at an offset specified by \a __imm. -/// \param __imm -/// An immediate value specifying the bit offset in the result at which the -/// lower 16 bits of \a __b are written. -/// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi16(a, b, imm) \ - ((__m128i)__builtin_ia32_vec_set_v8hi((__v8hi)(__m128i)(a), (int)(b), \ - (int)(imm))) - -/// Copies the values of the most significant bits from each 8-bit -/// element in a 128-bit integer vector of [16 x i8] to create a 16-bit mask -/// value, zero-extends the value, and writes it to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVMSKB / PMOVMSKB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the values with bits to be extracted. -/// \returns The most significant bits from each 8-bit element in \a __a, -/// written to bits [15:0]. The other bits are assigned zeros. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_movemask_epi8(__m128i __a) -{ - return __builtin_ia32_pmovmskb128((__v16qi)__a); -} - -/// Constructs a 128-bit integer vector by shuffling four 32-bit -/// elements of a 128-bit integer vector parameter, using the immediate-value -/// parameter as a specifier. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_shuffle_epi32(__m128i a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VPSHUFD / PSHUFD instruction. -/// -/// \param a -/// A 128-bit integer vector containing the values to be copied. -/// \param imm -/// An immediate value containing an 8-bit value specifying which elements to -/// copy from a. The destinations within the 128-bit destination are assigned -/// values as follows: \n -/// Bits [1:0] are used to assign values to bits [31:0] of the result. \n -/// Bits [3:2] are used to assign values to bits [63:32] of the result. \n -/// Bits [5:4] are used to assign values to bits [95:64] of the result. \n -/// Bits [7:6] are used to assign values to bits [127:96] of the result. \n -/// Bit value assignments: \n -/// 00: assign values from bits [31:0] of \a a. \n -/// 01: assign values from bits [63:32] of \a a. \n -/// 10: assign values from bits [95:64] of \a a. \n -/// 11: assign values from bits [127:96] of \a a. -/// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shuffle_epi32(a, imm) \ - ((__m128i)__builtin_ia32_pshufd((__v4si)(__m128i)(a), (int)(imm))) - -/// Constructs a 128-bit integer vector by shuffling four lower 16-bit -/// elements of a 128-bit integer vector of [8 x i16], using the immediate -/// value parameter as a specifier. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_shufflelo_epi16(__m128i a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VPSHUFLW / PSHUFLW instruction. -/// -/// \param a -/// A 128-bit integer vector of [8 x i16]. Bits [127:64] are copied to bits -/// [127:64] of the result. -/// \param imm -/// An 8-bit immediate value specifying which elements to copy from \a a. \n -/// Bits[1:0] are used to assign values to bits [15:0] of the result. \n -/// Bits[3:2] are used to assign values to bits [31:16] of the result. 
\n -/// Bits[5:4] are used to assign values to bits [47:32] of the result. \n -/// Bits[7:6] are used to assign values to bits [63:48] of the result. \n -/// Bit value assignments: \n -/// 00: assign values from bits [15:0] of \a a. \n -/// 01: assign values from bits [31:16] of \a a. \n -/// 10: assign values from bits [47:32] of \a a. \n -/// 11: assign values from bits [63:48] of \a a. \n -/// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shufflelo_epi16(a, imm) \ - ((__m128i)__builtin_ia32_pshuflw((__v8hi)(__m128i)(a), (int)(imm))) - -/// Constructs a 128-bit integer vector by shuffling four upper 16-bit -/// elements of a 128-bit integer vector of [8 x i16], using the immediate -/// value parameter as a specifier. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_shufflehi_epi16(__m128i a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VPSHUFHW / PSHUFHW instruction. -/// -/// \param a -/// A 128-bit integer vector of [8 x i16]. Bits [63:0] are copied to bits -/// [63:0] of the result. -/// \param imm -/// An 8-bit immediate value specifying which elements to copy from \a a. \n -/// Bits[1:0] are used to assign values to bits [79:64] of the result. \n -/// Bits[3:2] are used to assign values to bits [95:80] of the result. \n -/// Bits[5:4] are used to assign values to bits [111:96] of the result. \n -/// Bits[7:6] are used to assign values to bits [127:112] of the result. \n -/// Bit value assignments: \n -/// 00: assign values from bits [79:64] of \a a. \n -/// 01: assign values from bits [95:80] of \a a. \n -/// 10: assign values from bits [111:96] of \a a. \n -/// 11: assign values from bits [127:112] of \a a. \n -/// \returns A 128-bit integer vector containing the shuffled values. -#define _mm_shufflehi_epi16(a, imm) \ - ((__m128i)__builtin_ia32_pshufhw((__v8hi)(__m128i)(a), (int)(imm))) - -/// Unpacks the high-order (index 8-15) values from two 128-bit vectors -/// of [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKHBW / PUNPCKHBW -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [16 x i8]. -/// Bits [71:64] are written to bits [7:0] of the result. \n -/// Bits [79:72] are written to bits [23:16] of the result. \n -/// Bits [87:80] are written to bits [39:32] of the result. \n -/// Bits [95:88] are written to bits [55:48] of the result. \n -/// Bits [103:96] are written to bits [71:64] of the result. \n -/// Bits [111:104] are written to bits [87:80] of the result. \n -/// Bits [119:112] are written to bits [103:96] of the result. \n -/// Bits [127:120] are written to bits [119:112] of the result. -/// \param __b -/// A 128-bit vector of [16 x i8]. \n -/// Bits [71:64] are written to bits [15:8] of the result. \n -/// Bits [79:72] are written to bits [31:24] of the result. \n -/// Bits [87:80] are written to bits [47:40] of the result. \n -/// Bits [95:88] are written to bits [63:56] of the result. \n -/// Bits [103:96] are written to bits [79:72] of the result. \n -/// Bits [111:104] are written to bits [95:88] of the result. \n -/// Bits [119:112] are written to bits [111:104] of the result. \n -/// Bits [127:120] are written to bits [127:120] of the result. -/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. 
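To make the immediate encoding for _mm_shuffle_epi32 above concrete: each 2-bit field of the immediate, starting with bits [1:0] for result bits [31:0], selects one source element. A small editorial sketch, not part of the header or the diff, assuming a hypothetical test program for an SSE2 target:

#include <emmintrin.h>
#include <assert.h>

int main(void)
{
  __m128i v = _mm_setr_epi32(10, 20, 30, 40);
  /* 0x1B is 0b00011011: the fields select elements 3, 2, 1, 0, reversing the vector. */
  __m128i r = _mm_shuffle_epi32(v, 0x1B);
  /* Bits [31:0] of the result now hold the old top element. */
  assert(_mm_cvtsi128_si32(r) == 40);
  return 0;
}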
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpackhi_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 8, 16+8, 9, 16+9, 10, 16+10, 11, 16+11, 12, 16+12, 13, 16+13, 14, 16+14, 15, 16+15); -} - -/// Unpacks the high-order (index 4-7) values from two 128-bit vectors of -/// [8 x i16] and interleaves them into a 128-bit vector of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKHWD / PUNPCKHWD -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16]. -/// Bits [79:64] are written to bits [15:0] of the result. \n -/// Bits [95:80] are written to bits [47:32] of the result. \n -/// Bits [111:96] are written to bits [79:64] of the result. \n -/// Bits [127:112] are written to bits [111:96] of the result. -/// \param __b -/// A 128-bit vector of [8 x i16]. -/// Bits [79:64] are written to bits [31:16] of the result. \n -/// Bits [95:80] are written to bits [63:48] of the result. \n -/// Bits [111:96] are written to bits [95:80] of the result. \n -/// Bits [127:112] are written to bits [127:112] of the result. -/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpackhi_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 4, 8+4, 5, 8+5, 6, 8+6, 7, 8+7); -} - -/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of -/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKHDQ / PUNPCKHDQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32]. \n -/// Bits [95:64] are written to bits [31:0] of the destination. \n -/// Bits [127:96] are written to bits [95:64] of the destination. -/// \param __b -/// A 128-bit vector of [4 x i32]. \n -/// Bits [95:64] are written to bits [63:32] of the destination. \n -/// Bits [127:96] are written to bits [127:96] of the destination. -/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpackhi_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 2, 4+2, 3, 4+3); -} - -/// Unpacks the high-order 64-bit elements from two 128-bit vectors of -/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKHQDQ / PUNPCKHQDQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x i64]. \n -/// Bits [127:64] are written to bits [63:0] of the destination. -/// \param __b -/// A 128-bit vector of [2 x i64]. \n -/// Bits [127:64] are written to bits [127:64] of the destination. -/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpackhi_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 1, 2+1); -} - -/// Unpacks the low-order (index 0-7) values from two 128-bit vectors of -/// [16 x i8] and interleaves them into a 128-bit vector of [16 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLBW / PUNPCKLBW -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [16 x i8]. \n -/// Bits [7:0] are written to bits [7:0] of the result. \n -/// Bits [15:8] are written to bits [23:16] of the result. 
\n -/// Bits [23:16] are written to bits [39:32] of the result. \n -/// Bits [31:24] are written to bits [55:48] of the result. \n -/// Bits [39:32] are written to bits [71:64] of the result. \n -/// Bits [47:40] are written to bits [87:80] of the result. \n -/// Bits [55:48] are written to bits [103:96] of the result. \n -/// Bits [63:56] are written to bits [119:112] of the result. -/// \param __b -/// A 128-bit vector of [16 x i8]. -/// Bits [7:0] are written to bits [15:8] of the result. \n -/// Bits [15:8] are written to bits [31:24] of the result. \n -/// Bits [23:16] are written to bits [47:40] of the result. \n -/// Bits [31:24] are written to bits [63:56] of the result. \n -/// Bits [39:32] are written to bits [79:72] of the result. \n -/// Bits [47:40] are written to bits [95:88] of the result. \n -/// Bits [55:48] are written to bits [111:104] of the result. \n -/// Bits [63:56] are written to bits [127:120] of the result. -/// \returns A 128-bit vector of [16 x i8] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpacklo_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v16qi)__a, (__v16qi)__b, 0, 16+0, 1, 16+1, 2, 16+2, 3, 16+3, 4, 16+4, 5, 16+5, 6, 16+6, 7, 16+7); -} - -/// Unpacks the low-order (index 0-3) values from each of the two 128-bit -/// vectors of [8 x i16] and interleaves them into a 128-bit vector of -/// [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLWD / PUNPCKLWD -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16]. -/// Bits [15:0] are written to bits [15:0] of the result. \n -/// Bits [31:16] are written to bits [47:32] of the result. \n -/// Bits [47:32] are written to bits [79:64] of the result. \n -/// Bits [63:48] are written to bits [111:96] of the result. -/// \param __b -/// A 128-bit vector of [8 x i16]. -/// Bits [15:0] are written to bits [31:16] of the result. \n -/// Bits [31:16] are written to bits [63:48] of the result. \n -/// Bits [47:32] are written to bits [95:80] of the result. \n -/// Bits [63:48] are written to bits [127:112] of the result. -/// \returns A 128-bit vector of [8 x i16] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpacklo_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v8hi)__a, (__v8hi)__b, 0, 8+0, 1, 8+1, 2, 8+2, 3, 8+3); -} - -/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of -/// [4 x i32] and interleaves them into a 128-bit vector of [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLDQ / PUNPCKLDQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32]. \n -/// Bits [31:0] are written to bits [31:0] of the destination. \n -/// Bits [63:32] are written to bits [95:64] of the destination. -/// \param __b -/// A 128-bit vector of [4 x i32]. \n -/// Bits [31:0] are written to bits [63:32] of the destination. \n -/// Bits [63:32] are written to bits [127:96] of the destination. -/// \returns A 128-bit vector of [4 x i32] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpacklo_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v4si)__a, (__v4si)__b, 0, 4+0, 1, 4+1); -} - -/// Unpacks the low-order 64-bit elements from two 128-bit vectors of -/// [2 x i64] and interleaves them into a 128-bit vector of [2 x i64]. 
-/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPUNPCKLQDQ / PUNPCKLQDQ -/// instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x i64]. \n -/// Bits [63:0] are written to bits [63:0] of the destination. \n -/// \param __b -/// A 128-bit vector of [2 x i64]. \n -/// Bits [63:0] are written to bits [127:64] of the destination. \n -/// \returns A 128-bit vector of [2 x i64] containing the interleaved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_unpacklo_epi64(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_shufflevector((__v2di)__a, (__v2di)__b, 0, 2+0); -} - -/// Returns the lower 64 bits of a 128-bit integer vector as a 64-bit -/// integer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVDQ2Q instruction. -/// -/// \param __a -/// A 128-bit integer vector operand. The lower 64 bits are moved to the -/// destination. -/// \returns A 64-bit integer containing the lower 64 bits of the parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_movepi64_pi64(__m128i __a) -{ - return (__m64)__a[0]; -} - -/// Moves the 64-bit operand to a 128-bit integer vector, zeroing the -/// upper bits. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVD+VMOVQ instruction. -/// -/// \param __a -/// A 64-bit value. -/// \returns A 128-bit integer vector. The lower 64 bits contain the value from -/// the operand. The upper 64 bits are assigned zeros. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_movpi64_epi64(__m64 __a) -{ - return __extension__ (__m128i)(__v2di){ (long long)__a, 0 }; -} - -/// Moves the lower 64 bits of a 128-bit integer vector to a 128-bit -/// integer vector, zeroing the upper bits. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVQ / MOVQ instruction. -/// -/// \param __a -/// A 128-bit integer vector operand. The lower 64 bits are moved to the -/// destination. -/// \returns A 128-bit integer vector. The lower 64 bits contain the value from -/// the operand. The upper 64 bits are assigned zeros. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_move_epi64(__m128i __a) -{ - return __builtin_shufflevector((__v2di)__a, _mm_setzero_si128(), 0, 2); -} - -/// Unpacks the high-order 64-bit elements from two 128-bit vectors of -/// [2 x double] and interleaves them into a 128-bit vector of [2 x -/// double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKHPD / UNPCKHPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. \n -/// Bits [127:64] are written to bits [63:0] of the destination. -/// \param __b -/// A 128-bit vector of [2 x double]. \n -/// Bits [127:64] are written to bits [127:64] of the destination. -/// \returns A 128-bit vector of [2 x double] containing the interleaved values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_unpackhi_pd(__m128d __a, __m128d __b) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 1, 2+1); -} - -/// Unpacks the low-order 64-bit elements from two 128-bit vectors -/// of [2 x double] and interleaves them into a 128-bit vector of [2 x -/// double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. \n -/// Bits [63:0] are written to bits [63:0] of the destination. -/// \param __b -/// A 128-bit vector of [2 x double]. \n -/// Bits [63:0] are written to bits [127:64] of the destination. -/// \returns A 128-bit vector of [2 x double] containing the interleaved values. 
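(Editor's aside, not part of the diff.) The unpack intrinsics being removed above all follow the same pattern: they interleave elements taken from the two operands, which makes common zip/transpose steps a single instruction. A minimal sketch of that usage, assuming an SSE2-capable clang target; the helper names are illustrative:

#include <emmintrin.h>

/* Interleave the low eight bytes of a and b: a0,b0,a1,b1,...,a7,b7. */
static inline __m128i zip_low_bytes(__m128i a, __m128i b)
{
    return _mm_unpacklo_epi8(a, b);
}

/* Interleave the high eight bytes of a and b: a8,b8,...,a15,b15. */
static inline __m128i zip_high_bytes(__m128i a, __m128i b)
{
    return _mm_unpackhi_epi8(a, b);
}
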
-static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_unpacklo_pd(__m128d __a, __m128d __b) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)__b, 0, 2+0); -} - -/// Extracts the sign bits of the double-precision values in the 128-bit -/// vector of [2 x double], zero-extends the value, and writes it to the -/// low-order bits of the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVMSKPD / MOVMSKPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the values with sign bits to -/// be extracted. -/// \returns The sign bits from each of the double-precision elements in \a __a, -/// written to bits [1:0]. The remaining bits are assigned values of zero. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_movemask_pd(__m128d __a) -{ - return __builtin_ia32_movmskpd((__v2df)__a); -} - - -/// Constructs a 128-bit floating-point vector of [2 x double] from two -/// 128-bit vector parameters of [2 x double], using the immediate-value -/// parameter as a specifier. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_shuffle_pd(__m128d a, __m128d b, const int i); -/// \endcode -/// -/// This intrinsic corresponds to the VSHUFPD / SHUFPD instruction. -/// -/// \param a -/// A 128-bit vector of [2 x double]. -/// \param b -/// A 128-bit vector of [2 x double]. -/// \param i -/// An 8-bit immediate value. The least significant two bits specify which -/// elements to copy from \a a and \a b: \n -/// Bit[0] = 0: lower element of \a a copied to lower element of result. \n -/// Bit[0] = 1: upper element of \a a copied to lower element of result. \n -/// Bit[1] = 0: lower element of \a b copied to upper element of result. \n -/// Bit[1] = 1: upper element of \a b copied to upper element of result. \n -/// \returns A 128-bit vector of [2 x double] containing the shuffled values. -#define _mm_shuffle_pd(a, b, i) \ - ((__m128d)__builtin_ia32_shufpd((__v2df)(__m128d)(a), (__v2df)(__m128d)(b), \ - (int)(i))) - -/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit -/// floating-point vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [2 x double]. -/// \returns A 128-bit floating-point vector of [4 x float] containing the same -/// bitwise pattern as the parameter. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_castpd_ps(__m128d __a) -{ - return (__m128)__a; -} - -/// Casts a 128-bit floating-point vector of [2 x double] into a 128-bit -/// integer vector. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [2 x double]. -/// \returns A 128-bit integer vector containing the same bitwise pattern as the -/// parameter. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_castpd_si128(__m128d __a) -{ - return (__m128i)__a; -} - -/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit -/// floating-point vector of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. -/// \returns A 128-bit floating-point vector of [2 x double] containing the same -/// bitwise pattern as the parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_castps_pd(__m128 __a) -{ - return (__m128d)__a; -} - -/// Casts a 128-bit floating-point vector of [4 x float] into a 128-bit -/// integer vector. 
-/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. -/// \returns A 128-bit integer vector containing the same bitwise pattern as the -/// parameter. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_castps_si128(__m128 __a) -{ - return (__m128i)__a; -} - -/// Casts a 128-bit integer vector into a 128-bit floating-point vector -/// of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 128-bit floating-point vector of [4 x float] containing the same -/// bitwise pattern as the parameter. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_castsi128_ps(__m128i __a) -{ - return (__m128)__a; -} - -/// Casts a 128-bit integer vector into a 128-bit floating-point vector -/// of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit integer vector. -/// \returns A 128-bit floating-point vector of [2 x double] containing the same -/// bitwise pattern as the parameter. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_castsi128_pd(__m128i __a) -{ - return (__m128d)__a; -} - -#if defined(__cplusplus) -extern "C" { -#endif - -/// Indicates that a spin loop is being executed for the purposes of -/// optimizing power consumption during the loop. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PAUSE instruction. -/// -void _mm_pause(void); - -#if defined(__cplusplus) -} // extern "C" -#endif -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX - -#define _MM_SHUFFLE2(x, y) (((x) << 1) | (y)) - -#define _MM_DENORMALS_ZERO_ON (0x0040U) -#define _MM_DENORMALS_ZERO_OFF (0x0000U) - -#define _MM_DENORMALS_ZERO_MASK (0x0040U) - -#define _MM_GET_DENORMALS_ZERO_MODE() (_mm_getcsr() & _MM_DENORMALS_ZERO_MASK) -#define _MM_SET_DENORMALS_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) - -#endif /* __EMMINTRIN_H */ diff --git a/include/enqcmdintrin.h b/include/enqcmdintrin.h deleted file mode 100644 index 30af67f..0000000 --- a/include/enqcmdintrin.h +++ /dev/null @@ -1,63 +0,0 @@ -/*===------------------ enqcmdintrin.h - enqcmd intrinsics -----------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __ENQCMDINTRIN_H -#define __ENQCMDINTRIN_H - -/* Define the default attributes for the functions in this file */ -#define _DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("enqcmd"))) - -/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store -/// data, and performs 64-byte enqueue store to memory pointed by \a __dst. -/// This intrinsics may only be used in User mode. -/// -/// \headerfile -/// -/// This intrinsics corresponds to the ENQCMD instruction. -/// -/// \param __dst -/// Pointer to the destination of the enqueue store. -/// \param __src -/// Pointer to 64-byte command data. -/// \returns If the command data is successfully written to \a __dst then 0 is -/// returned. Otherwise 1 is returned. 
-static __inline__ int _DEFAULT_FN_ATTRS -_enqcmd (void *__dst, const void *__src) -{ - return __builtin_ia32_enqcmd(__dst, __src); -} - -/// Reads 64-byte command pointed by \a __src, formats 64-byte enqueue store -/// data, and performs 64-byte enqueue store to memory pointed by \a __dst -/// This intrinsic may only be used in Privileged mode. -/// -/// \headerfile -/// -/// This intrinsics corresponds to the ENQCMDS instruction. -/// -/// \param __dst -/// Pointer to the destination of the enqueue store. -/// \param __src -/// Pointer to 64-byte command data. -/// \returns If the command data is successfully written to \a __dst then 0 is -/// returned. Otherwise 1 is returned. -static __inline__ int _DEFAULT_FN_ATTRS -_enqcmds (void *__dst, const void *__src) -{ - return __builtin_ia32_enqcmds(__dst, __src); -} - -#undef _DEFAULT_FN_ATTRS - -#endif /* __ENQCMDINTRIN_H */ diff --git a/include/f16cintrin.h b/include/f16cintrin.h deleted file mode 100644 index 13905e6..0000000 --- a/include/f16cintrin.h +++ /dev/null @@ -1,162 +0,0 @@ -/*===---- f16cintrin.h - F16C intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __F16CINTRIN_H -#define __F16CINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 \ - __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 \ - __attribute__((__always_inline__, __nodebug__, __target__("f16c"), __min_vector_width__(256))) - -/* NOTE: Intel documents the 128-bit versions of these as being in emmintrin.h, - * but that's because icc can emulate these without f16c using a library call. - * Since we don't do that let's leave these in f16cintrin.h. - */ - -/// Converts a 16-bit half-precision float value into a 32-bit float -/// value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPH2PS instruction. -/// -/// \param __a -/// A 16-bit half-precision float value. -/// \returns The converted 32-bit float value. -static __inline float __DEFAULT_FN_ATTRS128 -_cvtsh_ss(unsigned short __a) -{ - __v8hi __v = {(short)__a, 0, 0, 0, 0, 0, 0, 0}; - __v4sf __r = __builtin_ia32_vcvtph2ps(__v); - return __r[0]; -} - -/// Converts a 32-bit single-precision float value to a 16-bit -/// half-precision float value. -/// -/// \headerfile -/// -/// \code -/// unsigned short _cvtss_sh(float a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VCVTPS2PH instruction. -/// -/// \param a -/// A 32-bit single-precision float value to be converted to a 16-bit -/// half-precision float value. -/// \param imm -/// An immediate value controlling rounding using bits [2:0]: \n -/// 000: Nearest \n -/// 001: Down \n -/// 010: Up \n -/// 011: Truncate \n -/// 1XX: Use MXCSR.RC for rounding -/// \returns The converted 16-bit half-precision float value. -#define _cvtss_sh(a, imm) \ - ((unsigned short)(((__v8hi)__builtin_ia32_vcvtps2ph((__v4sf){a, 0, 0, 0}, \ - (imm)))[0])) - -/// Converts a 128-bit vector containing 32-bit float values into a -/// 128-bit vector containing 16-bit half-precision float values. 
-/// -/// \headerfile -/// -/// \code -/// __m128i _mm_cvtps_ph(__m128 a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VCVTPS2PH instruction. -/// -/// \param a -/// A 128-bit vector containing 32-bit float values. -/// \param imm -/// An immediate value controlling rounding using bits [2:0]: \n -/// 000: Nearest \n -/// 001: Down \n -/// 010: Up \n -/// 011: Truncate \n -/// 1XX: Use MXCSR.RC for rounding -/// \returns A 128-bit vector containing converted 16-bit half-precision float -/// values. The lower 64 bits are used to store the converted 16-bit -/// half-precision floating-point values. -#define _mm_cvtps_ph(a, imm) \ - ((__m128i)__builtin_ia32_vcvtps2ph((__v4sf)(__m128)(a), (imm))) - -/// Converts a 128-bit vector containing 16-bit half-precision float -/// values into a 128-bit vector containing 32-bit float values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPH2PS instruction. -/// -/// \param __a -/// A 128-bit vector containing 16-bit half-precision float values. The lower -/// 64 bits are used in the conversion. -/// \returns A 128-bit vector of [4 x float] containing converted float values. -static __inline __m128 __DEFAULT_FN_ATTRS128 -_mm_cvtph_ps(__m128i __a) -{ - return (__m128)__builtin_ia32_vcvtph2ps((__v8hi)__a); -} - -/// Converts a 256-bit vector of [8 x float] into a 128-bit vector -/// containing 16-bit half-precision float values. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm256_cvtps_ph(__m256 a, const int imm); -/// \endcode -/// -/// This intrinsic corresponds to the VCVTPS2PH instruction. -/// -/// \param a -/// A 256-bit vector containing 32-bit single-precision float values to be -/// converted to 16-bit half-precision float values. -/// \param imm -/// An immediate value controlling rounding using bits [2:0]: \n -/// 000: Nearest \n -/// 001: Down \n -/// 010: Up \n -/// 011: Truncate \n -/// 1XX: Use MXCSR.RC for rounding -/// \returns A 128-bit vector containing the converted 16-bit half-precision -/// float values. -#define _mm256_cvtps_ph(a, imm) \ - ((__m128i)__builtin_ia32_vcvtps2ph256((__v8sf)(__m256)(a), (imm))) - -/// Converts a 128-bit vector containing 16-bit half-precision float -/// values into a 256-bit vector of [8 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTPH2PS instruction. -/// -/// \param __a -/// A 128-bit vector containing 16-bit half-precision float values to be -/// converted to 32-bit single-precision float values. -/// \returns A vector of [8 x float] containing the converted 32-bit -/// single-precision float values. -static __inline __m256 __DEFAULT_FN_ATTRS256 -_mm256_cvtph_ps(__m128i __a) -{ - return (__m256)__builtin_ia32_vcvtph2ps256((__v8hi)__a); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __F16CINTRIN_H */ diff --git a/include/fma4intrin.h b/include/fma4intrin.h deleted file mode 100644 index 694801b..0000000 --- a/include/fma4intrin.h +++ /dev/null @@ -1,218 +0,0 @@ -/*===---- fma4intrin.h - FMA4 intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __FMA4INTRIN_H -#define __FMA4INTRIN_H - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma4"), __min_vector_width__(256))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_macc_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_macc_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_macc_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_macc_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_msub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_msub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_msub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_msub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmacc_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmacc_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmacc_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmacc_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_nmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_nmsub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_maddsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return 
(__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_maddsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_msubadd_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_msubadd_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_macc_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_macc_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_msub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_msub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_nmacc_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_nmacc_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_nmsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_nmsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_maddsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_maddsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_msubadd_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_msubadd_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __FMA4INTRIN_H */ diff --git a/include/fmaintrin.h b/include/fmaintrin.h deleted file mode 100644 index d889b7c..0000000 --- a/include/fmaintrin.h +++ /dev/null @@ -1,216 +0,0 @@ -/*===---- fmaintrin.h - FMA intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __FMAINTRIN_H -#define __FMAINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS128 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("fma"), __min_vector_width__(256))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmadd_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmadd_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmadd_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmadd_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmsub_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmadd_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmadd_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmadd_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmadd_sd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddps(-(__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddpd(-(__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fnmsub_ss(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddss3((__v4sf)__A, -(__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fnmsub_sd(__m128d __A, __m128d 
__B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsd3((__v2df)__A, -(__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmaddsub_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, (__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmaddsub_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, (__v2df)__C); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS128 -_mm_fmsubadd_ps(__m128 __A, __m128 __B, __m128 __C) -{ - return (__m128)__builtin_ia32_vfmaddsubps((__v4sf)__A, (__v4sf)__B, -(__v4sf)__C); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS128 -_mm_fmsubadd_pd(__m128d __A, __m128d __B, __m128d __C) -{ - return (__m128d)__builtin_ia32_vfmaddsubpd((__v2df)__A, (__v2df)__B, -(__v2df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fmadd_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fmadd_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fmsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fmsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fnmadd_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fnmadd_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fnmsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddps256(-(__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fnmsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddpd256(-(__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fmaddsub_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, (__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fmaddsub_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, (__v4df)__C); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_fmsubadd_ps(__m256 __A, __m256 __B, __m256 __C) -{ - return (__m256)__builtin_ia32_vfmaddsubps256((__v8sf)__A, (__v8sf)__B, -(__v8sf)__C); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_fmsubadd_pd(__m256d __A, __m256d __B, __m256d __C) -{ - return (__m256d)__builtin_ia32_vfmaddsubpd256((__v4df)__A, (__v4df)__B, -(__v4df)__C); -} - -#undef __DEFAULT_FN_ATTRS128 -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __FMAINTRIN_H */ diff --git a/include/fxsrintrin.h b/include/fxsrintrin.h deleted file mode 100644 index afee6aa..0000000 --- a/include/fxsrintrin.h +++ /dev/null @@ -1,91 +0,0 @@ -/*===---- 
fxsrintrin.h - FXSR intrinsic ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __FXSRINTRIN_H -#define __FXSRINTRIN_H - -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("fxsr"))) - -/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte -/// memory region pointed to by the input parameter \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the FXSAVE instruction. -/// -/// \param __p -/// A pointer to a 512-byte memory region. The beginning of this memory -/// region should be aligned on a 16-byte boundary. -static __inline__ void __DEFAULT_FN_ATTRS -_fxsave(void *__p) -{ - __builtin_ia32_fxsave(__p); -} - -/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte -/// memory region pointed to by the input parameter \a __p. The contents of -/// this memory region should have been written to by a previous \c _fxsave -/// or \c _fxsave64 intrinsic. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the FXRSTOR instruction. -/// -/// \param __p -/// A pointer to a 512-byte memory region. The beginning of this memory -/// region should be aligned on a 16-byte boundary. -static __inline__ void __DEFAULT_FN_ATTRS -_fxrstor(void *__p) -{ - __builtin_ia32_fxrstor(__p); -} - -#ifdef __x86_64__ -/// Saves the XMM, MMX, MXCSR and x87 FPU registers into a 512-byte -/// memory region pointed to by the input parameter \a __p. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the FXSAVE64 instruction. -/// -/// \param __p -/// A pointer to a 512-byte memory region. The beginning of this memory -/// region should be aligned on a 16-byte boundary. -static __inline__ void __DEFAULT_FN_ATTRS -_fxsave64(void *__p) -{ - __builtin_ia32_fxsave64(__p); -} - -/// Restores the XMM, MMX, MXCSR and x87 FPU registers from the 512-byte -/// memory region pointed to by the input parameter \a __p. The contents of -/// this memory region should have been written to by a previous \c _fxsave -/// or \c _fxsave64 intrinsic. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the FXRSTOR64 instruction. -/// -/// \param __p -/// A pointer to a 512-byte memory region. The beginning of this memory -/// region should be aligned on a 16-byte boundary. -static __inline__ void __DEFAULT_FN_ATTRS -_fxrstor64(void *__p) -{ - __builtin_ia32_fxrstor64(__p); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/gfniintrin.h b/include/gfniintrin.h deleted file mode 100644 index a59238b..0000000 --- a/include/gfniintrin.h +++ /dev/null @@ -1,192 +0,0 @@ -/*===----------------- gfniintrin.h - GFNI intrinsics ----------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __GFNIINTRIN_H -#define __GFNIINTRIN_H - -/* Default attributes for simple form (no masking). 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("gfni"), __min_vector_width__(128))) - -/* Default attributes for YMM unmasked form. */ -#define __DEFAULT_FN_ATTRS_Y __attribute__((__always_inline__, __nodebug__, __target__("avx,gfni"), __min_vector_width__(256))) - -/* Default attributes for ZMM forms. */ -#define __DEFAULT_FN_ATTRS_Z __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,gfni"), __min_vector_width__(512))) - -/* Default attributes for VLX forms. */ -#define __DEFAULT_FN_ATTRS_VL128 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_VL256 __attribute__((__always_inline__, __nodebug__, __target__("avx512bw,avx512vl,gfni"), __min_vector_width__(256))) - -#define _mm_gf2p8affineinv_epi64_epi8(A, B, I) \ - ((__m128i)__builtin_ia32_vgf2p8affineinvqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I))) - -#define _mm_gf2p8affine_epi64_epi8(A, B, I) \ - ((__m128i)__builtin_ia32_vgf2p8affineqb_v16qi((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), \ - (char)(I))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_gf2p8mul_epi8(__m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_vgf2p8mulb_v16qi((__v16qi) __A, - (__v16qi) __B); -} - -#ifdef __AVXINTRIN_H -#define _mm256_gf2p8affineinv_epi64_epi8(A, B, I) \ - ((__m256i)__builtin_ia32_vgf2p8affineinvqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I))) - -#define _mm256_gf2p8affine_epi64_epi8(A, B, I) \ - ((__m256i)__builtin_ia32_vgf2p8affineqb_v32qi((__v32qi)(__m256i)(A), \ - (__v32qi)(__m256i)(B), \ - (char)(I))) - -static __inline__ __m256i __DEFAULT_FN_ATTRS_Y -_mm256_gf2p8mul_epi8(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_vgf2p8mulb_v32qi((__v32qi) __A, - (__v32qi) __B); -} -#endif /* __AVXINTRIN_H */ - -#ifdef __AVX512BWINTRIN_H -#define _mm512_gf2p8affineinv_epi64_epi8(A, B, I) \ - ((__m512i)__builtin_ia32_vgf2p8affineinvqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I))) - -#define _mm512_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v64qi)(__m512i)(S))) - -#define _mm512_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - _mm512_mask_gf2p8affineinv_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) - -#define _mm512_gf2p8affine_epi64_epi8(A, B, I) \ - ((__m512i)__builtin_ia32_vgf2p8affineqb_v64qi((__v64qi)(__m512i)(A), \ - (__v64qi)(__m512i)(B), \ - (char)(I))) - -#define _mm512_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m512i)__builtin_ia32_selectb_512((__mmask64)(U), \ - (__v64qi)_mm512_gf2p8affine_epi64_epi8((A), (B), (I)), \ - (__v64qi)(__m512i)(S))) - -#define _mm512_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - _mm512_mask_gf2p8affine_epi64_epi8((__m512i)_mm512_setzero_si512(), \ - U, A, B, I) - -static __inline__ __m512i __DEFAULT_FN_ATTRS_Z -_mm512_gf2p8mul_epi8(__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_vgf2p8mulb_v64qi((__v64qi) __A, - (__v64qi) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_Z -_mm512_mask_gf2p8mul_epi8(__m512i __S, __mmask64 __U, __m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_selectb_512(__U, - (__v64qi) _mm512_gf2p8mul_epi8(__A, __B), - (__v64qi) __S); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_Z -_mm512_maskz_gf2p8mul_epi8(__mmask64 __U, __m512i __A, __m512i __B) -{ - return 
_mm512_mask_gf2p8mul_epi8((__m512i)_mm512_setzero_si512(), - __U, __A, __B); -} -#endif /* __AVX512BWINTRIN_H */ - -#ifdef __AVX512VLBWINTRIN_H -#define _mm_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S))) - -#define _mm_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - _mm_mask_gf2p8affineinv_epi64_epi8((__m128i)_mm_setzero_si128(), \ - U, A, B, I) - -#define _mm256_mask_gf2p8affineinv_epi64_epi8(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affineinv_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S))) - -#define _mm256_maskz_gf2p8affineinv_epi64_epi8(U, A, B, I) \ - _mm256_mask_gf2p8affineinv_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) - -#define _mm_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m128i)__builtin_ia32_selectb_128((__mmask16)(U), \ - (__v16qi)_mm_gf2p8affine_epi64_epi8(A, B, I), \ - (__v16qi)(__m128i)(S))) - -#define _mm_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - _mm_mask_gf2p8affine_epi64_epi8((__m128i)_mm_setzero_si128(), U, A, B, I) - -#define _mm256_mask_gf2p8affine_epi64_epi8(S, U, A, B, I) \ - ((__m256i)__builtin_ia32_selectb_256((__mmask32)(U), \ - (__v32qi)_mm256_gf2p8affine_epi64_epi8(A, B, I), \ - (__v32qi)(__m256i)(S))) - -#define _mm256_maskz_gf2p8affine_epi64_epi8(U, A, B, I) \ - _mm256_mask_gf2p8affine_epi64_epi8((__m256i)_mm256_setzero_si256(), \ - U, A, B, I) - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_mask_gf2p8mul_epi8(__m128i __S, __mmask16 __U, __m128i __A, __m128i __B) -{ - return (__m128i) __builtin_ia32_selectb_128(__U, - (__v16qi) _mm_gf2p8mul_epi8(__A, __B), - (__v16qi) __S); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS_VL128 -_mm_maskz_gf2p8mul_epi8(__mmask16 __U, __m128i __A, __m128i __B) -{ - return _mm_mask_gf2p8mul_epi8((__m128i)_mm_setzero_si128(), - __U, __A, __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_mask_gf2p8mul_epi8(__m256i __S, __mmask32 __U, __m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_selectb_256(__U, - (__v32qi) _mm256_gf2p8mul_epi8(__A, __B), - (__v32qi) __S); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS_VL256 -_mm256_maskz_gf2p8mul_epi8(__mmask32 __U, __m256i __A, __m256i __B) -{ - return _mm256_mask_gf2p8mul_epi8((__m256i)_mm256_setzero_si256(), - __U, __A, __B); -} -#endif /* __AVX512VLBWINTRIN_H */ - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_Y -#undef __DEFAULT_FN_ATTRS_Z -#undef __DEFAULT_FN_ATTRS_VL128 -#undef __DEFAULT_FN_ATTRS_VL256 - -#endif /* __GFNIINTRIN_H */ - diff --git a/include/hresetintrin.h b/include/hresetintrin.h deleted file mode 100644 index 13e31a2..0000000 --- a/include/hresetintrin.h +++ /dev/null @@ -1,49 +0,0 @@ -/*===---------------- hresetintrin.h - HRESET intrinsics -------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __X86GPRINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __HRESETINTRIN_H -#define __HRESETINTRIN_H - -#if __has_extension(gnu_asm) - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("hreset"))) - -/// Provides a hint to the processor to selectively reset the prediction -/// history of the current logical processor specified by a 32-bit integer -/// value \a __eax. -/// -/// This intrinsic corresponds to the HRESET instruction. -/// -/// \operation -/// IF __eax == 0 -/// // nop -/// ELSE -/// FOR i := 0 to 31 -/// IF __eax[i] -/// ResetPredictionFeature(i) -/// FI -/// ENDFOR -/// FI -/// \endoperation -static __inline void __DEFAULT_FN_ATTRS -_hreset(int __eax) -{ - __asm__ ("hreset $0" :: "a"(__eax)); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __has_extension(gnu_asm) */ - -#endif /* __HRESETINTRIN_H */ diff --git a/include/ia32intrin.h b/include/ia32intrin.h deleted file mode 100644 index ec8142b..0000000 --- a/include/ia32intrin.h +++ /dev/null @@ -1,441 +0,0 @@ -/* ===-------- ia32intrin.h ---------------------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __IA32INTRIN_H -#define __IA32INTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) -#define __DEFAULT_FN_ATTRS_CRC32 __attribute__((__always_inline__, __nodebug__, __target__("crc32"))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) constexpr -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr -#else -#define __DEFAULT_FN_ATTRS_CAST __attribute__((__always_inline__)) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS -#endif - -/** Find the first set bit starting from the lsb. Result is undefined if - * input is 0. - * - * \headerfile - * - * This intrinsic corresponds to the BSF instruction or the - * TZCNT instruction. - * - * \param __A - * A 32-bit integer operand. - * \returns A 32-bit integer containing the bit number. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__bsfd(int __A) { - return __builtin_ctz(__A); -} - -/** Find the first set bit starting from the msb. Result is undefined if - * input is 0. - * - * \headerfile - * - * This intrinsic corresponds to the BSR instruction or the - * LZCNT instruction and an XOR . - * - * \param __A - * A 32-bit integer operand. - * \returns A 32-bit integer containing the bit number. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__bsrd(int __A) { - return 31 - __builtin_clz(__A); -} - -/** Swaps the bytes in the input. Converting little endian to big endian or - * vice versa. - * - * \headerfile - * - * This intrinsic corresponds to the BSWAP instruction. - * - * \param __A - * A 32-bit integer operand. - * \returns A 32-bit integer containing the swapped bytes. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__bswapd(int __A) { - return __builtin_bswap32(__A); -} - -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -_bswap(int __A) { - return __builtin_bswap32(__A); -} - -#define _bit_scan_forward(A) __bsfd((A)) -#define _bit_scan_reverse(A) __bsrd((A)) - -#ifdef __x86_64__ -/** Find the first set bit starting from the lsb. Result is undefined if - * input is 0. 
- * - * \headerfile - * - * This intrinsic corresponds to the BSF instruction or the - * TZCNT instruction. - * - * \param __A - * A 64-bit integer operand. - * \returns A 32-bit integer containing the bit number. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__bsfq(long long __A) { - return __builtin_ctzll(__A); -} - -/** Find the first set bit starting from the msb. Result is undefined if - * input is 0. - * - * \headerfile - * - * This intrinsic corresponds to the BSR instruction or the - * LZCNT instruction and an XOR . - * - * \param __A - * A 64-bit integer operand. - * \returns A 32-bit integer containing the bit number. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__bsrq(long long __A) { - return 63 - __builtin_clzll(__A); -} - -/** Swaps the bytes in the input. Converting little endian to big endian or - * vice versa. - * - * \headerfile - * - * This intrinsic corresponds to the BSWAP instruction. - * - * \param __A - * A 64-bit integer operand. - * \returns A 64-bit integer containing the swapped bytes. - */ -static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR -__bswapq(long long __A) { - return __builtin_bswap64(__A); -} - -#define _bswap64(A) __bswapq((A)) -#endif - -/** Counts the number of bits in the source operand having a value of 1. - * - * \headerfile - * - * This intrinsic corresponds to the POPCNT instruction or a - * a sequence of arithmetic and logic ops to calculate it. - * - * \param __A - * An unsigned 32-bit integer operand. - * \returns A 32-bit integer containing the number of bits with value 1 in the - * source operand. - */ -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -__popcntd(unsigned int __A) -{ - return __builtin_popcount(__A); -} - -#define _popcnt32(A) __popcntd((A)) - -#ifdef __x86_64__ -/** Counts the number of bits in the source operand having a value of 1. - * - * \headerfile - * - * This intrinsic corresponds to the POPCNT instruction or a - * a sequence of arithmetic and logic ops to calculate it. - * - * \param __A - * An unsigned 64-bit integer operand. - * \returns A 64-bit integer containing the number of bits with value 1 in the - * source operand. - */ -static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR -__popcntq(unsigned long long __A) -{ - return __builtin_popcountll(__A); -} - -#define _popcnt64(A) __popcntq((A)) -#endif /* __x86_64__ */ - -#ifdef __x86_64__ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__readeflags(void) -{ - return __builtin_ia32_readeflags_u64(); -} - -static __inline__ void __DEFAULT_FN_ATTRS -__writeeflags(unsigned long long __f) -{ - __builtin_ia32_writeeflags_u64(__f); -} - -#else /* !__x86_64__ */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__readeflags(void) -{ - return __builtin_ia32_readeflags_u32(); -} - -static __inline__ void __DEFAULT_FN_ATTRS -__writeeflags(unsigned int __f) -{ - __builtin_ia32_writeeflags_u32(__f); -} -#endif /* !__x86_64__ */ - -/** Cast a 32-bit float value to a 32-bit unsigned integer value - * - * \headerfile - * This intrinsic corresponds to the VMOVD / MOVD instruction in x86_64, - * and corresponds to the VMOVL / MOVL instruction in ia32. - * - * \param __A - * A 32-bit float value. - * \returns a 32-bit unsigned integer containing the converted value. 
- */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CAST -_castf32_u32(float __A) { - return __builtin_bit_cast(unsigned int, __A); -} - -/** Cast a 64-bit float value to a 64-bit unsigned integer value - * - * \headerfile - * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, - * and corresponds to the VMOVL / MOVL instruction in ia32. - * - * \param __A - * A 64-bit float value. - * \returns a 64-bit unsigned integer containing the converted value. - */ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CAST -_castf64_u64(double __A) { - return __builtin_bit_cast(unsigned long long, __A); -} - -/** Cast a 32-bit unsigned integer value to a 32-bit float value - * - * \headerfile - * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, - * and corresponds to the FLDS instruction in ia32. - * - * \param __A - * A 32-bit unsigned integer value. - * \returns a 32-bit float value containing the converted value. - */ -static __inline__ float __DEFAULT_FN_ATTRS_CAST -_castu32_f32(unsigned int __A) { - return __builtin_bit_cast(float, __A); -} - -/** Cast a 64-bit unsigned integer value to a 64-bit float value - * - * \headerfile - * This intrinsic corresponds to the VMOVQ / MOVQ instruction in x86_64, - * and corresponds to the FLDL instruction in ia32. - * - * \param __A - * A 64-bit unsigned integer value. - * \returns a 64-bit float value containing the converted value. - */ -static __inline__ double __DEFAULT_FN_ATTRS_CAST -_castu64_f64(unsigned long long __A) { - return __builtin_bit_cast(double, __A); -} - -/** Adds the unsigned integer operand to the CRC-32C checksum of the - * unsigned char operand. - * - * \headerfile - * - * This intrinsic corresponds to the CRC32B instruction. - * - * \param __C - * An unsigned integer operand to add to the CRC-32C checksum of operand - * \a __D. - * \param __D - * An unsigned 8-bit integer operand used to compute the CRC-32C checksum. - * \returns The result of adding operand \a __C to the CRC-32C checksum of - * operand \a __D. - */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 -__crc32b(unsigned int __C, unsigned char __D) -{ - return __builtin_ia32_crc32qi(__C, __D); -} - -/** Adds the unsigned integer operand to the CRC-32C checksum of the - * unsigned short operand. - * - * \headerfile - * - * This intrinsic corresponds to the CRC32W instruction. - * - * \param __C - * An unsigned integer operand to add to the CRC-32C checksum of operand - * \a __D. - * \param __D - * An unsigned 16-bit integer operand used to compute the CRC-32C checksum. - * \returns The result of adding operand \a __C to the CRC-32C checksum of - * operand \a __D. - */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 -__crc32w(unsigned int __C, unsigned short __D) -{ - return __builtin_ia32_crc32hi(__C, __D); -} - -/** Adds the unsigned integer operand to the CRC-32C checksum of the - * second unsigned integer operand. - * - * \headerfile - * - * This intrinsic corresponds to the CRC32D instruction. - * - * \param __C - * An unsigned integer operand to add to the CRC-32C checksum of operand - * \a __D. - * \param __D - * An unsigned 32-bit integer operand used to compute the CRC-32C checksum. - * \returns The result of adding operand \a __C to the CRC-32C checksum of - * operand \a __D. 
- */ -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CRC32 -__crc32d(unsigned int __C, unsigned int __D) -{ - return __builtin_ia32_crc32si(__C, __D); -} - -#ifdef __x86_64__ -/** Adds the unsigned integer operand to the CRC-32C checksum of the - * unsigned 64-bit integer operand. - * - * \headerfile - * - * This intrinsic corresponds to the CRC32Q instruction. - * - * \param __C - * An unsigned integer operand to add to the CRC-32C checksum of operand - * \a __D. - * \param __D - * An unsigned 64-bit integer operand used to compute the CRC-32C checksum. - * \returns The result of adding operand \a __C to the CRC-32C checksum of - * operand \a __D. - */ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CRC32 -__crc32q(unsigned long long __C, unsigned long long __D) -{ - return __builtin_ia32_crc32di(__C, __D); -} -#endif /* __x86_64__ */ - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__rdpmc(int __A) { - return __builtin_ia32_rdpmc(__A); -} - -/* __rdtscp */ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__rdtscp(unsigned int *__A) { - return __builtin_ia32_rdtscp(__A); -} - -#define _rdtsc() __rdtsc() - -#define _rdpmc(A) __rdpmc(A) - -static __inline__ void __DEFAULT_FN_ATTRS -_wbinvd(void) { - __builtin_ia32_wbinvd(); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR -__rolb(unsigned char __X, int __C) { - return __builtin_rotateleft8(__X, __C); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS_CONSTEXPR -__rorb(unsigned char __X, int __C) { - return __builtin_rotateright8(__X, __C); -} - -static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR -__rolw(unsigned short __X, int __C) { - return __builtin_rotateleft16(__X, __C); -} - -static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR -__rorw(unsigned short __X, int __C) { - return __builtin_rotateright16(__X, __C); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR -__rold(unsigned int __X, int __C) { - return __builtin_rotateleft32(__X, __C); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR -__rord(unsigned int __X, int __C) { - return __builtin_rotateright32(__X, __C); -} - -#ifdef __x86_64__ -static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR -__rolq(unsigned long long __X, int __C) { - return __builtin_rotateleft64(__X, __C); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS_CONSTEXPR -__rorq(unsigned long long __X, int __C) { - return __builtin_rotateright64(__X, __C); -} -#endif /* __x86_64__ */ - -#ifndef _MSC_VER -/* These are already provided as builtins for MSVC. */ -/* Select the correct function based on the size of long. */ -#ifdef __LP64__ -#define _lrotl(a,b) __rolq((a), (b)) -#define _lrotr(a,b) __rorq((a), (b)) -#else -#define _lrotl(a,b) __rold((a), (b)) -#define _lrotr(a,b) __rord((a), (b)) -#endif -#define _rotl(a,b) __rold((a), (b)) -#define _rotr(a,b) __rord((a), (b)) -#endif // _MSC_VER - -/* These are not builtins so need to be provided in all modes. 
*/ -#define _rotwl(a,b) __rolw((a), (b)) -#define _rotwr(a,b) __rorw((a), (b)) - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CAST -#undef __DEFAULT_FN_ATTRS_CRC32 -#undef __DEFAULT_FN_ATTRS_CONSTEXPR - -#endif /* __IA32INTRIN_H */ diff --git a/include/immintrin.h b/include/immintrin.h deleted file mode 100644 index e5174f8..0000000 --- a/include/immintrin.h +++ /dev/null @@ -1,618 +0,0 @@ -/*===---- immintrin.h - Intel intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#define __IMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__MMX__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SSE__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SSE2__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SSE3__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SSSE3__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__SSE4_2__) || defined(__SSE4_1__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AES__) || defined(__PCLMUL__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__CLFLUSHOPT__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__CLWB__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX2__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__F16C__) -#include -#endif - -/* No feature check desired due to internal checks */ -#include - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__BMI2__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__LZCNT__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__POPCNT__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__FMA__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512F__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VL__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512BW__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512BITALG__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - 
defined(__AVX512CD__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VPOPCNTDQ__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512VPOPCNTDQ__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VNNI__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512VNNI__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVXVNNI__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512DQ__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512BITALG__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512BW__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512CD__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512DQ__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512ER__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512IFMA__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512IFMA__) && defined(__AVX512VL__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VBMI__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VBMI__) && defined(__AVX512VL__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VBMI2__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VBMI2__) && defined(__AVX512VL__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512PF__) -#include -#endif - -/* - * FIXME: _Float16 type is legal only when HW support float16 operation. - * We use __AVX512FP16__ to identify if float16 is supported or not, so - * when float16 is not supported, the related header is not included. 
- * - */ -#if defined(__AVX512FP16__) -#include -#endif - -#if defined(__AVX512FP16__) && defined(__AVX512VL__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512BF16__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512BF16__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__PKU__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__VPCLMULQDQ__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__VAES__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__GFNI__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__RDPID__) -/// Returns the value of the IA32_TSC_AUX MSR (0xc0000103). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the RDPID instruction. -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("rdpid"))) -_rdpid_u32(void) { - return __builtin_ia32_rdpid(); -} -#endif // __RDPID__ - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__RDRND__) -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand16_step(unsigned short *__p) -{ - return __builtin_ia32_rdrand16_step(__p); -} - -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand32_step(unsigned int *__p) -{ - return __builtin_ia32_rdrand32_step(__p); -} - -#ifdef __x86_64__ -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("rdrnd"))) -_rdrand64_step(unsigned long long *__p) -{ - return __builtin_ia32_rdrand64_step(__p); -} -#endif -#endif /* __RDRND__ */ - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__FSGSBASE__) -#ifdef __x86_64__ -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_readfsbase_u32(void) -{ - return __builtin_ia32_rdfsbase32(); -} - -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_readfsbase_u64(void) -{ - return __builtin_ia32_rdfsbase64(); -} - -static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_readgsbase_u32(void) -{ - return __builtin_ia32_rdgsbase32(); -} - -static __inline__ unsigned long long __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_readgsbase_u64(void) -{ - return __builtin_ia32_rdgsbase64(); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_writefsbase_u32(unsigned int __V) -{ - __builtin_ia32_wrfsbase32(__V); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_writefsbase_u64(unsigned long long __V) -{ - __builtin_ia32_wrfsbase64(__V); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_writegsbase_u32(unsigned int __V) -{ - __builtin_ia32_wrgsbase32(__V); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("fsgsbase"))) -_writegsbase_u64(unsigned long long __V) -{ - __builtin_ia32_wrgsbase64(__V); -} - -#endif -#endif /* 
__FSGSBASE__ */ - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__MOVBE__) - -/* The structs used below are to force the load/store to be unaligned. This - * is accomplished with the __packed__ attribute. The __may_alias__ prevents - * tbaa metadata from being generated based on the struct and the type of the - * field inside of it. - */ - -static __inline__ short __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_loadbe_i16(void const * __P) { - struct __loadu_i16 { - short __v; - } __attribute__((__packed__, __may_alias__)); - return __builtin_bswap16(((const struct __loadu_i16*)__P)->__v); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_storebe_i16(void * __P, short __D) { - struct __storeu_i16 { - short __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_i16*)__P)->__v = __builtin_bswap16(__D); -} - -static __inline__ int __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_loadbe_i32(void const * __P) { - struct __loadu_i32 { - int __v; - } __attribute__((__packed__, __may_alias__)); - return __builtin_bswap32(((const struct __loadu_i32*)__P)->__v); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_storebe_i32(void * __P, int __D) { - struct __storeu_i32 { - int __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_i32*)__P)->__v = __builtin_bswap32(__D); -} - -#ifdef __x86_64__ -static __inline__ long long __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_loadbe_i64(void const * __P) { - struct __loadu_i64 { - long long __v; - } __attribute__((__packed__, __may_alias__)); - return __builtin_bswap64(((const struct __loadu_i64*)__P)->__v); -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("movbe"))) -_storebe_i64(void * __P, long long __D) { - struct __storeu_i64 { - long long __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_i64*)__P)->__v = __builtin_bswap64(__D); -} -#endif -#endif /* __MOVBE */ - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__RTM__) -#include -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SHA__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__FXSR__) -#include -#endif - -/* No feature check desired due to internal MSC_VER checks */ -#include - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__XSAVEOPT__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__XSAVEC__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__XSAVES__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SHSTK__) -#include -#endif - -/* Some intrinsics inside adxintrin.h are available only on processors with ADX, - * whereas others are also available at all times. 
*/ -#include - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__RDSEED__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__WBNOINVD__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__CLDEMOTE__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__WAITPKG__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__MOVDIRI__) || defined(__MOVDIR64B__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__PCONFIG__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SGX__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__PTWRITE__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__INVPCID__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__KL__) || defined(__WIDEKL__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AMXTILE__) || defined(__AMXINT8__) || defined(__AMXBF16__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__AVX512VP2INTERSECT__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - (defined(__AVX512VL__) && defined(__AVX512VP2INTERSECT__)) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__ENQCMD__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SERIALIZE__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__TSXLDTRK__) -#include -#endif - -#if defined(_MSC_VER) && __has_extension(gnu_asm) -/* Define the default attributes for these intrinsics */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) -#ifdef __cplusplus -extern "C" { -#endif -/*----------------------------------------------------------------------------*\ -|* Interlocked Exchange HLE -\*----------------------------------------------------------------------------*/ -#if defined(__i386__) || defined(__x86_64__) -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchange_HLEAcquire(long volatile *_Target, long _Value) { - __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}" - : "+r" (_Value), "+m" (*_Target) :: "memory"); - return _Value; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedExchange_HLERelease(long volatile *_Target, long _Value) { - __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}" - : "+r" (_Value), "+m" (*_Target) :: "memory"); - return _Value; -} -#endif -#if defined(__x86_64__) -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64_HLEAcquire(__int64 volatile *_Target, __int64 _Value) { - __asm__ __volatile__(".byte 0xf2 ; lock ; xchg {%0, %1|%1, %0}" - : "+r" (_Value), "+m" (*_Target) :: "memory"); - return _Value; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedExchange64_HLERelease(__int64 volatile *_Target, __int64 _Value) { - __asm__ __volatile__(".byte 0xf3 ; lock ; xchg {%0, %1|%1, %0}" - : "+r" (_Value), "+m" 
(*_Target) :: "memory"); - return _Value; -} -#endif -/*----------------------------------------------------------------------------*\ -|* Interlocked Compare Exchange HLE -\*----------------------------------------------------------------------------*/ -#if defined(__i386__) || defined(__x86_64__) -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedCompareExchange_HLEAcquire(long volatile *_Destination, - long _Exchange, long _Comparand) { - __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}" - : "+a" (_Comparand), "+m" (*_Destination) - : "r" (_Exchange) : "memory"); - return _Comparand; -} -static __inline__ long __DEFAULT_FN_ATTRS -_InterlockedCompareExchange_HLERelease(long volatile *_Destination, - long _Exchange, long _Comparand) { - __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}" - : "+a" (_Comparand), "+m" (*_Destination) - : "r" (_Exchange) : "memory"); - return _Comparand; -} -#endif -#if defined(__x86_64__) -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64_HLEAcquire(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { - __asm__ __volatile__(".byte 0xf2 ; lock ; cmpxchg {%2, %1|%1, %2}" - : "+a" (_Comparand), "+m" (*_Destination) - : "r" (_Exchange) : "memory"); - return _Comparand; -} -static __inline__ __int64 __DEFAULT_FN_ATTRS -_InterlockedCompareExchange64_HLERelease(__int64 volatile *_Destination, - __int64 _Exchange, __int64 _Comparand) { - __asm__ __volatile__(".byte 0xf3 ; lock ; cmpxchg {%2, %1|%1, %2}" - : "+a" (_Comparand), "+m" (*_Destination) - : "r" (_Exchange) : "memory"); - return _Comparand; -} -#endif -#ifdef __cplusplus -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* defined(_MSC_VER) && __has_extension(gnu_asm) */ - -#endif /* __IMMINTRIN_H */ diff --git a/include/invpcidintrin.h b/include/invpcidintrin.h deleted file mode 100644 index 48dae0a..0000000 --- a/include/invpcidintrin.h +++ /dev/null @@ -1,23 +0,0 @@ -/*===------------- invpcidintrin.h - INVPCID intrinsic ---------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __INVPCIDINTRIN_H -#define __INVPCIDINTRIN_H - -static __inline__ void - __attribute__((__always_inline__, __nodebug__, __target__("invpcid"))) -_invpcid(unsigned int __type, void *__descriptor) { - __builtin_ia32_invpcid(__type, __descriptor); -} - -#endif /* __INVPCIDINTRIN_H */ diff --git a/include/keylockerintrin.h b/include/keylockerintrin.h deleted file mode 100644 index ad9428e..0000000 --- a/include/keylockerintrin.h +++ /dev/null @@ -1,530 +0,0 @@ -/*===----------------- keylockerintrin.h - KL Intrinsics -------------------=== - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - * THE SOFTWARE. - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef _KEYLOCKERINTRIN_H -#define _KEYLOCKERINTRIN_H - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__KL__) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("kl"),\ - __min_vector_width__(128))) - -/// Load internal wrapping key from __intkey, __enkey_lo and __enkey_hi. __ctl -/// will assigned to EAX, whch specifies the KeySource and whether backing up -/// the key is permitted. The 256-bit encryption key is loaded from the two -/// explicit operands (__enkey_lo and __enkey_hi). The 128-bit integrity key is -/// loaded from the implicit operand XMM0 which assigned by __intkey. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LOADIWKEY instructions. -/// -/// \operation -/// IF CPL > 0 // LOADKWKEY only allowed at ring 0 (supervisor mode) -/// GP (0) -/// FI -/// IF “LOADIWKEY exiting” VM execution control set -/// VMexit -/// FI -/// IF __ctl[4:1] > 1 // Reserved KeySource encoding used -/// GP (0) -/// FI -/// IF __ctl[31:5] != 0 // Reserved bit in __ctl is set -/// GP (0) -/// FI -/// IF __ctl[0] AND (CPUID.19H.ECX[0] == 0) // NoBackup is not supported on this part -/// GP (0) -/// FI -/// IF (__ctl[4:1] == 1) AND (CPUID.19H.ECX[1] == 0) // KeySource of 1 is not supported on this part -/// GP (0) -/// FI -/// IF (__ctl[4:1] == 0) // KeySource of 0. -/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0]: -/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] -/// IWKey.IntegrityKey[127:0] := __intkey[127:0] -/// IWKey.NoBackup := __ctl[0] -/// IWKey.KeySource := __ctl[4:1] -/// ZF := 0 -/// ELSE // KeySource of 1. 
See RDSEED definition for details of randomness -/// IF HW_NRND_GEN.ready == 1 // Full-entropy random data from RDSEED was received -/// IWKey.Encryption Key[127:0] := __enkey_hi[127:0] XOR HW_NRND_GEN.data[127:0] -/// IWKey.Encryption Key[255:128] := __enkey_lo[127:0] XOR HW_NRND_GEN.data[255:128] -/// IWKey.Encryption Key[255:0] := __enkey_hi[127:0]:__enkey_lo[127:0] XOR HW_NRND_GEN.data[255:0] -/// IWKey.IntegrityKey[127:0] := __intkey[127:0] XOR HW_NRND_GEN.data[383:256] -/// IWKey.NoBackup := __ctl[0] -/// IWKey.KeySource := __ctl[4:1] -/// ZF := 0 -/// ELSE // Random data was not returned from RDSEED. IWKey was not loaded -/// ZF := 1 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ void __DEFAULT_FN_ATTRS -_mm_loadiwkey (unsigned int __ctl, __m128i __intkey, - __m128i __enkey_lo, __m128i __enkey_hi) { - __builtin_ia32_loadiwkey (__intkey, __enkey_lo, __enkey_hi, __ctl); -} - -/// Wrap a 128-bit AES key from __key into a key handle and output in -/// ((__m128i*)__h) to ((__m128i*)__h) + 2 and a 32-bit value as return. -/// The explicit source operand __htype specifies handle restrictions. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the ENCODEKEY128 instructions. -/// -/// \operation -/// InputKey[127:0] := __key[127:0] -/// KeyMetadata[2:0] := __htype[2:0] -/// KeyMetadata[23:3] := 0 // Reserved for future usage -/// KeyMetadata[27:24] := 0 // KeyType is AES-128 (value of 0) -/// KeyMetadata[127:28] := 0 // Reserved for future usage -/// Handle[383:0] := WrapKey128(InputKey[127:0], KeyMetadata[127:0], -/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) -/// dst[0] := IWKey.NoBackup -/// dst[4:1] := IWKey.KeySource[3:0] -/// dst[31:5] := 0 -/// MEM[__h+127:__h] := Handle[127:0] // AAD -/// MEM[__h+255:__h+128] := Handle[255:128] // Integrity Tag -/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText -/// OF := 0 -/// SF := 0 -/// ZF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_encodekey128_u32(unsigned int __htype, __m128i __key, void *__h) { - return __builtin_ia32_encodekey128_u32(__htype, (__v2di)__key, __h); -} - -/// Wrap a 256-bit AES key from __key_hi:__key_lo into a key handle, then -/// output handle in ((__m128i*)__h) to ((__m128i*)__h) + 3 and -/// a 32-bit value as return. -/// The explicit source operand __htype specifies handle restrictions. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the ENCODEKEY256 instructions. 
-/// -/// \operation -/// InputKey[127:0] := __key_lo[127:0] -/// InputKey[255:128] := __key_hi[255:128] -/// KeyMetadata[2:0] := __htype[2:0] -/// KeyMetadata[23:3] := 0 // Reserved for future usage -/// KeyMetadata[27:24] := 1 // KeyType is AES-256 (value of 1) -/// KeyMetadata[127:28] := 0 // Reserved for future usage -/// Handle[511:0] := WrapKey256(InputKey[255:0], KeyMetadata[127:0], -/// IWKey.Integrity Key[127:0], IWKey.Encryption Key[255:0]) -/// dst[0] := IWKey.NoBackup -/// dst[4:1] := IWKey.KeySource[3:0] -/// dst[31:5] := 0 -/// MEM[__h+127:__h] := Handle[127:0] // AAD -/// MEM[__h+255:__h+128] := Handle[255:128] // Tag -/// MEM[__h+383:__h+256] := Handle[383:256] // CipherText[127:0] -/// MEM[__h+511:__h+384] := Handle[511:384] // CipherText[255:128] -/// OF := 0 -/// SF := 0 -/// ZF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_mm_encodekey256_u32(unsigned int __htype, __m128i __key_lo, __m128i __key_hi, - void *__h) { - return __builtin_ia32_encodekey256_u32(__htype, (__v2di)__key_lo, - (__v2di)__key_hi, __h); -} - -/// The AESENC128KL performs 10 rounds of AES to encrypt the __idata using -/// the 128-bit key in the handle from the __h. It stores the result in the -/// __odata. And return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESENC128KL instructions. -/// -/// \operation -/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. -/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[383:256] || -/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) -/// IF (IllegalHandle) -/// ZF := 1 -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) -/// IF (Authentic == 0) -/// ZF := 1 -/// ELSE -/// MEM[__odata+127:__odata] := AES128Encrypt (__idata[127:0], UnwrappedKey) -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesenc128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { - return __builtin_ia32_aesenc128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); -} - -/// The AESENC256KL performs 14 rounds of AES to encrypt the __idata using -/// the 256-bit key in the handle from the __h. It stores the result in the -/// __odata. And return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESENC256KL instructions. -/// -/// \operation -/// Handle[511:0] := MEM[__h+511:__h] // Load is not guaranteed to be atomic. 
-/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[255:128] || -/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256 ) -/// IF (IllegalHandle) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) -/// IF (Authentic == 0) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// MEM[__odata+127:__odata] := AES256Encrypt (__idata[127:0], UnwrappedKey) -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesenc256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { - return __builtin_ia32_aesenc256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); -} - -/// The AESDEC128KL performs 10 rounds of AES to decrypt the __idata using -/// the 128-bit key in the handle from the __h. It stores the result in the -/// __odata. And return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESDEC128KL instructions. -/// -/// \operation -/// Handle[383:0] := MEM[__h+383:__h] // Load is not guaranteed to be atomic. -/// IllegalHandle := (HandleReservedBitSet (Handle[383:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[383:256] || -/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128) -/// IF (IllegalHandle) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) -/// IF (Authentic == 0) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// MEM[__odata+127:__odata] := AES128Decrypt (__idata[127:0], UnwrappedKey) -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesdec128kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { - return __builtin_ia32_aesdec128kl_u8((__v2di *)__odata, (__v2di)__idata, __h); -} - -/// The AESDEC256KL performs 10 rounds of AES to decrypt the __idata using -/// the 256-bit key in the handle from the __h. It stores the result in the -/// __odata. And return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESDEC256KL instructions. 
-/// -/// \operation -/// Handle[511:0] := MEM[__h+511:__h] -/// IllegalHandle := (HandleReservedBitSet (Handle[511:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[383:256] || -/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES256) -/// IF (IllegalHandle) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) -/// IF (Authentic == 0) -/// ZF := 1 -/// MEM[__odata+127:__odata] := 0 -/// ELSE -/// MEM[__odata+127:__odata] := AES256Decrypt (__idata[127:0], UnwrappedKey) -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesdec256kl_u8(__m128i* __odata, __m128i __idata, const void *__h) { - return __builtin_ia32_aesdec256kl_u8((__v2di *)__odata, (__v2di)__idata, __h); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ - || defined(__KL__) */ - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__WIDEKL__) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("kl,widekl"),\ - __min_vector_width__(128))) - -/// Encrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle -/// at __h and store each resultant block back from __odata to __odata+7. And -/// return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESENCWIDE128KL instructions. -/// -/// \operation -/// Handle := MEM[__h+383:__h] -/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[255:128] || -/// HandleKeyType (Handle[383:0]) != HANDLE_KEY_TYPE_AES128 ) -/// IF (IllegalHandle) -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) -/// IF Authentic == 0 -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// FOR i := 0 to 7 -/// __odata[i] := AES128Encrypt (__idata[i], UnwrappedKey) -/// ENDFOR -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesencwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { - return __builtin_ia32_aesencwide128kl_u8((__v2di *)__odata, - (const __v2di *)__idata, __h); -} - -/// Encrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle -/// at __h and store each resultant block back from __odata to __odata+7. And -/// return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESENCWIDE256KL instructions. 
-/// -/// \operation -/// Handle[511:0] := MEM[__h+511:__h] -/// IllegalHandle := ( HandleReservedBitSet (Handle[511:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[255:128] || -/// HandleKeyType (Handle[511:0]) != HANDLE_KEY_TYPE_AES512 ) -/// IF (IllegalHandle) -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) -/// IF Authentic == 0 -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// FOR i := 0 to 7 -/// __odata[i] := AES256Encrypt (__idata[i], UnwrappedKey) -/// ENDFOR -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesencwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { - return __builtin_ia32_aesencwide256kl_u8((__v2di *)__odata, - (const __v2di *)__idata, __h); -} - -/// Decrypt __idata[0] to __idata[7] using 128-bit AES key indicated by handle -/// at __h and store each resultant block back from __odata to __odata+7. And -/// return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESDECWIDE128KL instructions. -/// -/// \operation -/// Handle[383:0] := MEM[__h+383:__h] -/// IllegalHandle := ( HandleReservedBitSet (Handle[383:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[255:128] || -/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES128 ) -/// IF (IllegalHandle) -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate384 (Handle[383:0], IWKey) -/// IF Authentic == 0 -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// FOR i := 0 to 7 -/// __odata[i] := AES128Decrypt (__idata[i], UnwrappedKey) -/// ENDFOR -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesdecwide128kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { - return __builtin_ia32_aesdecwide128kl_u8((__v2di *)__odata, - (const __v2di *)__idata, __h); -} - -/// Decrypt __idata[0] to __idata[7] using 256-bit AES key indicated by handle -/// at __h and store each resultant block back from __odata to __odata+7. And -/// return the affected ZF flag status. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the AESDECWIDE256KL instructions. 
-/// -/// \operation -/// Handle[511:0] := MEM[__h+511:__h] -/// IllegalHandle = ( HandleReservedBitSet (Handle[511:0]) || -/// (Handle[127:0] AND (CPL > 0)) || -/// Handle[255:128] || -/// HandleKeyType (Handle) != HANDLE_KEY_TYPE_AES512 ) -/// If (IllegalHandle) -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// (UnwrappedKey, Authentic) := UnwrapKeyAndAuthenticate512 (Handle[511:0], IWKey) -/// IF Authentic == 0 -/// ZF := 1 -/// FOR i := 0 to 7 -/// __odata[i] := 0 -/// ENDFOR -/// ELSE -/// FOR i := 0 to 7 -/// __odata[i] := AES256Decrypt (__idata[i], UnwrappedKey) -/// ENDFOR -/// ZF := 0 -/// FI -/// FI -/// dst := ZF -/// OF := 0 -/// SF := 0 -/// AF := 0 -/// PF := 0 -/// CF := 0 -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_mm_aesdecwide256kl_u8(__m128i __odata[8], const __m128i __idata[8], const void* __h) { - return __builtin_ia32_aesdecwide256kl_u8((__v2di *)__odata, - (const __v2di *)__idata, __h); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) \ - || defined(__WIDEKL__) */ - -#endif /* _KEYLOCKERINTRIN_H */ diff --git a/include/lwpintrin.h b/include/lwpintrin.h deleted file mode 100644 index d8ab0db..0000000 --- a/include/lwpintrin.h +++ /dev/null @@ -1,136 +0,0 @@ -/*===---- lwpintrin.h - LWP intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __LWPINTRIN_H -#define __LWPINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lwp"))) - -/// Parses the LWPCB at the specified address and enables -/// profiling if valid. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LLWPCB instruction. -/// -/// \param __addr -/// Address to the new Lightweight Profiling Control Block (LWPCB). If the -/// LWPCB is valid, writes the address into the LWP_CBADDR MSR and enables -/// Lightweight Profiling. -static __inline__ void __DEFAULT_FN_ATTRS -__llwpcb (void *__addr) -{ - __builtin_ia32_llwpcb(__addr); -} - -/// Flushes the LWP state to memory and returns the address of the LWPCB. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the SLWPCB instruction. -/// -/// \return -/// Address to the current Lightweight Profiling Control Block (LWPCB). -/// If LWP is not currently enabled, returns NULL. -static __inline__ void* __DEFAULT_FN_ATTRS -__slwpcb (void) -{ - return __builtin_ia32_slwpcb(); -} - -/// Inserts programmed event record into the LWP event ring buffer -/// and advances the ring buffer pointer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LWPINS instruction. -/// -/// \param DATA2 -/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. -/// \param DATA1 -/// A 32-bit value is inserted into the 32-bit Data1 field. -/// \param FLAGS -/// A 32-bit immediate value is inserted into the 32-bit Flags field. 
-/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, -/// the event record overwrites the last record in the buffer, the MissedEvents -/// counter in the LWPCB is incremented, the head pointer is not advanced, and -/// 1 is returned. Otherwise 0 is returned. -#define __lwpins32(DATA2, DATA1, FLAGS) \ - (__builtin_ia32_lwpins32((unsigned int) (DATA2), (unsigned int) (DATA1), \ - (unsigned int) (FLAGS))) - -/// Decrements the LWP programmed value sample event counter. If the result is -/// negative, inserts an event record into the LWP event ring buffer in memory -/// and advances the ring buffer pointer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LWPVAL instruction. -/// -/// \param DATA2 -/// A 32-bit value is zero-extended and inserted into the 64-bit Data2 field. -/// \param DATA1 -/// A 32-bit value is inserted into the 32-bit Data1 field. -/// \param FLAGS -/// A 32-bit immediate value is inserted into the 32-bit Flags field. -#define __lwpval32(DATA2, DATA1, FLAGS) \ - (__builtin_ia32_lwpval32((unsigned int) (DATA2), (unsigned int) (DATA1), \ - (unsigned int) (FLAGS))) - -#ifdef __x86_64__ - -/// Inserts programmed event record into the LWP event ring buffer -/// and advances the ring buffer pointer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LWPINS instruction. -/// -/// \param DATA2 -/// A 64-bit value is inserted into the 64-bit Data2 field. -/// \param DATA1 -/// A 32-bit value is inserted into the 32-bit Data1 field. -/// \param FLAGS -/// A 32-bit immediate value is inserted into the 32-bit Flags field. -/// \returns If the ring buffer is full and LWP is running in Synchronized Mode, -/// the event record overwrites the last record in the buffer, the MissedEvents -/// counter in the LWPCB is incremented, the head pointer is not advanced, and -/// 1 is returned. Otherwise 0 is returned. -#define __lwpins64(DATA2, DATA1, FLAGS) \ - (__builtin_ia32_lwpins64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ - (unsigned int) (FLAGS))) - -/// Decrements the LWP programmed value sample event counter. If the result is -/// negative, inserts an event record into the LWP event ring buffer in memory -/// and advances the ring buffer pointer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the LWPVAL instruction. -/// -/// \param DATA2 -/// A 64-bit value is and inserted into the 64-bit Data2 field. -/// \param DATA1 -/// A 32-bit value is inserted into the 32-bit Data1 field. -/// \param FLAGS -/// A 32-bit immediate value is inserted into the 32-bit Flags field. -#define __lwpval64(DATA2, DATA1, FLAGS) \ - (__builtin_ia32_lwpval64((unsigned long long) (DATA2), (unsigned int) (DATA1), \ - (unsigned int) (FLAGS))) - -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* __LWPINTRIN_H */ diff --git a/include/lzcntintrin.h b/include/lzcntintrin.h deleted file mode 100644 index f4ddce9..0000000 --- a/include/lzcntintrin.h +++ /dev/null @@ -1,104 +0,0 @@ -/*===---- lzcntintrin.h - LZCNT intrinsics ---------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __LZCNTINTRIN_H -#define __LZCNTINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("lzcnt"))) - -#ifndef _MSC_VER -/// Counts the number of leading zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c LZCNT instruction. -/// -/// \param __X -/// An unsigned 16-bit integer whose leading zeros are to be counted. -/// \returns An unsigned 16-bit integer containing the number of leading zero -/// bits in the operand. -#define __lzcnt16(X) __builtin_ia32_lzcnt_u16((unsigned short)(X)) -#endif // _MSC_VER - -/// Counts the number of leading zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c LZCNT instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose leading zeros are to be counted. -/// \returns An unsigned 32-bit integer containing the number of leading zero -/// bits in the operand. -/// \see _lzcnt_u32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__lzcnt32(unsigned int __X) -{ - return __builtin_ia32_lzcnt_u32(__X); -} - -/// Counts the number of leading zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c LZCNT instruction. -/// -/// \param __X -/// An unsigned 32-bit integer whose leading zeros are to be counted. -/// \returns An unsigned 32-bit integer containing the number of leading zero -/// bits in the operand. -/// \see __lzcnt32 -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_lzcnt_u32(unsigned int __X) -{ - return __builtin_ia32_lzcnt_u32(__X); -} - -#ifdef __x86_64__ -#ifndef _MSC_VER -/// Counts the number of leading zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c LZCNT instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose leading zeros are to be counted. -/// \returns An unsigned 64-bit integer containing the number of leading zero -/// bits in the operand. -/// \see _lzcnt_u64 -#define __lzcnt64(X) __builtin_ia32_lzcnt_u64((unsigned long long)(X)) -#endif // _MSC_VER - -/// Counts the number of leading zero bits in the operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c LZCNT instruction. -/// -/// \param __X -/// An unsigned 64-bit integer whose leading zeros are to be counted. -/// \returns An unsigned 64-bit integer containing the number of leading zero -/// bits in the operand. -/// \see __lzcnt64 -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -_lzcnt_u64(unsigned long long __X) -{ - return __builtin_ia32_lzcnt_u64(__X); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* __LZCNTINTRIN_H */ diff --git a/include/mm3dnow.h b/include/mm3dnow.h deleted file mode 100644 index 22ab13a..0000000 --- a/include/mm3dnow.h +++ /dev/null @@ -1,157 +0,0 @@ -/*===---- mm3dnow.h - 3DNow! intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef _MM3DNOW_H_INCLUDED -#define _MM3DNOW_H_INCLUDED - -#include -#include - -typedef float __v2sf __attribute__((__vector_size__(8))); - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnow"), __min_vector_width__(64))) - -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("3dnow"))) -_m_femms(void) { - __builtin_ia32_femms(); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pavgusb(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pavgusb((__v8qi)__m1, (__v8qi)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pf2id(__m64 __m) { - return (__m64)__builtin_ia32_pf2id((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfadd(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfadd((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpeq(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpeq((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpge(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpge((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfcmpgt(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfcmpgt((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmax(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmax((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmin(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmin((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfmul(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfmul((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcp(__m64 __m) { - return (__m64)__builtin_ia32_pfrcp((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcpit1(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrcpit1((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrcpit2(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrcpit2((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrsqrt(__m64 __m) { - return (__m64)__builtin_ia32_pfrsqrt((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfrsqrtit1(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfrsqit1((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfsub(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfsub((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfsubr(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfsubr((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pi2fd(__m64 __m) { - return (__m64)__builtin_ia32_pi2fd((__v2si)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pmulhrw(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pmulhrw((__v4hi)__m1, (__v4hi)__m2); -} - -/* Handle the 3dnowa instructions here. 
*/ -#undef __DEFAULT_FN_ATTRS -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("3dnowa"), __min_vector_width__(64))) - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pf2iw(__m64 __m) { - return (__m64)__builtin_ia32_pf2iw((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfnacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfnacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pfpnacc(__m64 __m1, __m64 __m2) { - return (__m64)__builtin_ia32_pfpnacc((__v2sf)__m1, (__v2sf)__m2); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pi2fw(__m64 __m) { - return (__m64)__builtin_ia32_pi2fw((__v2si)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pswapdsf(__m64 __m) { - return (__m64)__builtin_ia32_pswapdsf((__v2sf)__m); -} - -static __inline__ __m64 __DEFAULT_FN_ATTRS -_m_pswapdsi(__m64 __m) { - return (__m64)__builtin_ia32_pswapdsi((__v2si)__m); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/mm_malloc.h b/include/mm_malloc.h deleted file mode 100644 index 933dbaa..0000000 --- a/include/mm_malloc.h +++ /dev/null @@ -1,67 +0,0 @@ -/*===---- mm_malloc.h - Allocating and Freeing Aligned Memory Blocks -------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __MM_MALLOC_H -#define __MM_MALLOC_H - -#include - -#ifdef _WIN32 -#include -#else -#ifndef __cplusplus -extern int posix_memalign(void **__memptr, size_t __alignment, size_t __size); -#else -// Some systems (e.g. those with GNU libc) declare posix_memalign with an -// exception specifier. Via an "egregious workaround" in -// Sema::CheckEquivalentExceptionSpec, Clang accepts the following as a valid -// redeclaration of glibc's declaration. -extern "C" int posix_memalign(void **__memptr, size_t __alignment, size_t __size); -#endif -#endif - -#if !(defined(_WIN32) && defined(_mm_malloc)) -static __inline__ void *__attribute__((__always_inline__, __nodebug__, - __malloc__)) -_mm_malloc(size_t __size, size_t __align) -{ - if (__align == 1) { - return malloc(__size); - } - - if (!(__align & (__align - 1)) && __align < sizeof(void *)) - __align = sizeof(void *); - - void *__mallocedMemory; -#if defined(__MINGW32__) - __mallocedMemory = __mingw_aligned_malloc(__size, __align); -#elif defined(_WIN32) - __mallocedMemory = _aligned_malloc(__size, __align); -#else - if (posix_memalign(&__mallocedMemory, __align, __size)) - return 0; -#endif - - return __mallocedMemory; -} - -static __inline__ void __attribute__((__always_inline__, __nodebug__)) -_mm_free(void *__p) -{ -#if defined(__MINGW32__) - __mingw_aligned_free(__p); -#elif defined(_WIN32) - _aligned_free(__p); -#else - free(__p); -#endif -} -#endif - -#endif /* __MM_MALLOC_H */ diff --git a/include/mmintrin.h b/include/mmintrin.h deleted file mode 100644 index 03bac92..0000000 --- a/include/mmintrin.h +++ /dev/null @@ -1,1562 +0,0 @@ -/*===---- mmintrin.h - MMX intrinsics --------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __MMINTRIN_H -#define __MMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -typedef long long __m64 __attribute__((__vector_size__(8), __aligned__(8))); - -typedef long long __v1di __attribute__((__vector_size__(8))); -typedef int __v2si __attribute__((__vector_size__(8))); -typedef short __v4hi __attribute__((__vector_size__(8))); -typedef char __v8qi __attribute__((__vector_size__(8))); - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mmx"), __min_vector_width__(64))) - -/// Clears the MMX state by setting the state of the x87 stack registers -/// to empty. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the EMMS instruction. -/// -static __inline__ void __attribute__((__always_inline__, __nodebug__, __target__("mmx"))) -_mm_empty(void) -{ - __builtin_ia32_emms(); -} - -/// Constructs a 64-bit integer vector, setting the lower 32 bits to the -/// value of the 32-bit integer parameter and setting the upper 32 bits to 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVD instruction. -/// -/// \param __i -/// A 32-bit integer value. -/// \returns A 64-bit integer vector. The lower 32 bits contain the value of the -/// parameter. The upper 32 bits are set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cvtsi32_si64(int __i) -{ - return (__m64)__builtin_ia32_vec_init_v2si(__i, 0); -} - -/// Returns the lower 32 bits of a 64-bit integer vector as a 32-bit -/// signed integer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVD instruction. -/// -/// \param __m -/// A 64-bit integer vector. -/// \returns A 32-bit signed integer value containing the lower 32 bits of the -/// parameter. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvtsi64_si32(__m64 __m) -{ - return __builtin_ia32_vec_ext_v2si((__v2si)__m, 0); -} - -/// Casts a 64-bit signed integer value into a 64-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVQ instruction. -/// -/// \param __i -/// A 64-bit signed integer. -/// \returns A 64-bit integer vector containing the same bitwise pattern as the -/// parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cvtsi64_m64(long long __i) -{ - return (__m64)__i; -} - -/// Casts a 64-bit integer vector into a 64-bit signed integer value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVQ instruction. -/// -/// \param __m -/// A 64-bit integer vector. -/// \returns A 64-bit signed integer containing the same bitwise pattern as the -/// parameter. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvtm64_si64(__m64 __m) -{ - return (long long)__m; -} - -/// Converts 16-bit signed integers from both 64-bit integer vector -/// parameters of [4 x i16] into 8-bit signed integer values, and constructs -/// a 64-bit integer vector of [8 x i8] as the result. Positive values -/// greater than 0x7F are saturated to 0x7F. Negative values less than 0x80 -/// are saturated to 0x80. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PACKSSWB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. 
Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit signed integer with -/// saturation. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. The converted -/// [4 x i8] values are written to the lower 32 bits of the result. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit signed integer with -/// saturation. Positive values greater than 0x7F are saturated to 0x7F. -/// Negative values less than 0x80 are saturated to 0x80. The converted -/// [4 x i8] values are written to the upper 32 bits of the result. -/// \returns A 64-bit integer vector of [8 x i8] containing the converted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_packs_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_packsswb((__v4hi)__m1, (__v4hi)__m2); -} - -/// Converts 32-bit signed integers from both 64-bit integer vector -/// parameters of [2 x i32] into 16-bit signed integer values, and constructs -/// a 64-bit integer vector of [4 x i16] as the result. Positive values -/// greater than 0x7FFF are saturated to 0x7FFF. Negative values less than -/// 0x8000 are saturated to 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PACKSSDW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a -/// 32-bit signed integer and is converted to a 16-bit signed integer with -/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. The converted -/// [2 x i16] values are written to the lower 32 bits of the result. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. Each 32-bit element is treated as a -/// 32-bit signed integer and is converted to a 16-bit signed integer with -/// saturation. Positive values greater than 0x7FFF are saturated to 0x7FFF. -/// Negative values less than 0x8000 are saturated to 0x8000. The converted -/// [2 x i16] values are written to the upper 32 bits of the result. -/// \returns A 64-bit integer vector of [4 x i16] containing the converted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_packs_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_packssdw((__v2si)__m1, (__v2si)__m2); -} - -/// Converts 16-bit signed integers from both 64-bit integer vector -/// parameters of [4 x i16] into 8-bit unsigned integer values, and -/// constructs a 64-bit integer vector of [8 x i8] as the result. Values -/// greater than 0xFF are saturated to 0xFF. Values less than 0 are saturated -/// to 0. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PACKUSWB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0 are saturated to 0. The converted [4 x i8] values are written to -/// the lower 32 bits of the result. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. Each 16-bit element is treated as a -/// 16-bit signed integer and is converted to an 8-bit unsigned integer with -/// saturation. Values greater than 0xFF are saturated to 0xFF. Values less -/// than 0 are saturated to 0. 
The converted [4 x i8] values are written to -/// the upper 32 bits of the result. -/// \returns A 64-bit integer vector of [8 x i8] containing the converted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_packs_pu16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_packuswb((__v4hi)__m1, (__v4hi)__m2); -} - -/// Unpacks the upper 32 bits from two 64-bit integer vectors of [8 x i8] -/// and interleaves them into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKHBW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. \n -/// Bits [39:32] are written to bits [7:0] of the result. \n -/// Bits [47:40] are written to bits [23:16] of the result. \n -/// Bits [55:48] are written to bits [39:32] of the result. \n -/// Bits [63:56] are written to bits [55:48] of the result. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// Bits [39:32] are written to bits [15:8] of the result. \n -/// Bits [47:40] are written to bits [31:24] of the result. \n -/// Bits [55:48] are written to bits [47:40] of the result. \n -/// Bits [63:56] are written to bits [63:56] of the result. -/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpackhi_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpckhbw((__v8qi)__m1, (__v8qi)__m2); -} - -/// Unpacks the upper 32 bits from two 64-bit integer vectors of -/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKHWD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// Bits [47:32] are written to bits [15:0] of the result. \n -/// Bits [63:48] are written to bits [47:32] of the result. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// Bits [47:32] are written to bits [31:16] of the result. \n -/// Bits [63:48] are written to bits [63:48] of the result. -/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpackhi_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpckhwd((__v4hi)__m1, (__v4hi)__m2); -} - -/// Unpacks the upper 32 bits from two 64-bit integer vectors of -/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKHDQ instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to -/// the lower 32 bits of the result. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. The upper 32 bits are written to -/// the upper 32 bits of the result. -/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpackhi_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpckhdq((__v2si)__m1, (__v2si)__m2); -} - -/// Unpacks the lower 32 bits from two 64-bit integer vectors of [8 x i8] -/// and interleaves them into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKLBW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// Bits [7:0] are written to bits [7:0] of the result. \n -/// Bits [15:8] are written to bits [23:16] of the result. \n -/// Bits [23:16] are written to bits [39:32] of the result. 
\n -/// Bits [31:24] are written to bits [55:48] of the result. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// Bits [7:0] are written to bits [15:8] of the result. \n -/// Bits [15:8] are written to bits [31:24] of the result. \n -/// Bits [23:16] are written to bits [47:40] of the result. \n -/// Bits [31:24] are written to bits [63:56] of the result. -/// \returns A 64-bit integer vector of [8 x i8] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpacklo_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpcklbw((__v8qi)__m1, (__v8qi)__m2); -} - -/// Unpacks the lower 32 bits from two 64-bit integer vectors of -/// [4 x i16] and interleaves them into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKLWD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// Bits [15:0] are written to bits [15:0] of the result. \n -/// Bits [31:16] are written to bits [47:32] of the result. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// Bits [15:0] are written to bits [31:16] of the result. \n -/// Bits [31:16] are written to bits [63:48] of the result. -/// \returns A 64-bit integer vector of [4 x i16] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpacklo_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpcklwd((__v4hi)__m1, (__v4hi)__m2); -} - -/// Unpacks the lower 32 bits from two 64-bit integer vectors of -/// [2 x i32] and interleaves them into a 64-bit integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PUNPCKLDQ instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to -/// the lower 32 bits of the result. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. The lower 32 bits are written to -/// the upper 32 bits of the result. -/// \returns A 64-bit integer vector of [2 x i32] containing the interleaved -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_unpacklo_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_punpckldq((__v2si)__m1, (__v2si)__m2); -} - -/// Adds each 8-bit integer element of the first 64-bit integer vector -/// of [8 x i8] to the corresponding 8-bit integer element of the second -/// 64-bit integer vector of [8 x i8]. The lower 8 bits of the results are -/// packed into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// \returns A 64-bit integer vector of [8 x i8] containing the sums of both -/// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_add_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Adds each 16-bit integer element of the first 64-bit integer vector -/// of [4 x i16] to the corresponding 16-bit integer element of the second -/// 64-bit integer vector of [4 x i16]. The lower 16 bits of the results are -/// packed into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. 
-/// \returns A 64-bit integer vector of [4 x i16] containing the sums of both -/// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_add_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Adds each 32-bit integer element of the first 64-bit integer vector -/// of [2 x i32] to the corresponding 32-bit integer element of the second -/// 64-bit integer vector of [2 x i32]. The lower 32 bits of the results are -/// packed into a 64-bit integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. -/// \returns A 64-bit integer vector of [2 x i32] containing the sums of both -/// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_add_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddd((__v2si)__m1, (__v2si)__m2); -} - -/// Adds each 8-bit signed integer element of the first 64-bit integer -/// vector of [8 x i8] to the corresponding 8-bit signed integer element of -/// the second 64-bit integer vector of [8 x i8]. Positive sums greater than -/// 0x7F are saturated to 0x7F. Negative sums less than 0x80 are saturated to -/// 0x80. The results are packed into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDSB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// \returns A 64-bit integer vector of [8 x i8] containing the saturated sums -/// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_adds_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddsb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Adds each 16-bit signed integer element of the first 64-bit integer -/// vector of [4 x i16] to the corresponding 16-bit signed integer element of -/// the second 64-bit integer vector of [4 x i16]. Positive sums greater than -/// 0x7FFF are saturated to 0x7FFF. Negative sums less than 0x8000 are -/// saturated to 0x8000. The results are packed into a 64-bit integer vector -/// of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDSW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [4 x i16] containing the saturated sums -/// of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_adds_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddsw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Adds each 8-bit unsigned integer element of the first 64-bit integer -/// vector of [8 x i8] to the corresponding 8-bit unsigned integer element of -/// the second 64-bit integer vector of [8 x i8]. Sums greater than 0xFF are -/// saturated to 0xFF. The results are packed into a 64-bit integer vector of -/// [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDUSB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// \returns A 64-bit integer vector of [8 x i8] containing the saturated -/// unsigned sums of both parameters. 
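
As context for the wrapping (_mm_add_*) and saturating (_mm_adds_*) additions documented above, here is a minimal usage sketch. It is illustrative only and not part of the header being removed; it assumes an x86-64 target with MMX available (e.g. clang -mmmx).

    #include <mmintrin.h>
    #include <stdio.h>

    int main(void)
    {
        __m64 a = _mm_set1_pi8((char)200);   /* 0xC8 in every byte */
        __m64 b = _mm_set1_pi8(100);         /* 0x64 in every byte */

        __m64 wrap = _mm_add_pi8(a, b);      /* 200 + 100 wraps to 44 (0x2C) */
        __m64 sat  = _mm_adds_pu8(a, b);     /* unsigned saturation clamps to 0xFF */

        unsigned long long w = (unsigned long long)_mm_cvtm64_si64(wrap);
        unsigned long long s = (unsigned long long)_mm_cvtm64_si64(sat);
        _mm_empty();                         /* leave MMX state before other FP use */

        printf("wrapping:  %016llx\nsaturated: %016llx\n", w, s);
        return 0;
    }
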
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_adds_pu8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddusb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Adds each 16-bit unsigned integer element of the first 64-bit integer -/// vector of [4 x i16] to the corresponding 16-bit unsigned integer element -/// of the second 64-bit integer vector of [4 x i16]. Sums greater than -/// 0xFFFF are saturated to 0xFFFF. The results are packed into a 64-bit -/// integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PADDUSW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [4 x i16] containing the saturated -/// unsigned sums of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_adds_pu16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_paddusw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Subtracts each 8-bit integer element of the second 64-bit integer -/// vector of [8 x i8] from the corresponding 8-bit integer element of the -/// first 64-bit integer vector of [8 x i8]. The lower 8 bits of the results -/// are packed into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8] containing the subtrahends. -/// \returns A 64-bit integer vector of [8 x i8] containing the differences of -/// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sub_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Subtracts each 16-bit integer element of the second 64-bit integer -/// vector of [4 x i16] from the corresponding 16-bit integer element of the -/// first 64-bit integer vector of [4 x i16]. The lower 16 bits of the -/// results are packed into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16] containing the subtrahends. -/// \returns A 64-bit integer vector of [4 x i16] containing the differences of -/// both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sub_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Subtracts each 32-bit integer element of the second 64-bit integer -/// vector of [2 x i32] from the corresponding 32-bit integer element of the -/// first 64-bit integer vector of [2 x i32]. The lower 32 bits of the -/// results are packed into a 64-bit integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32] containing the subtrahends. -/// \returns A 64-bit integer vector of [2 x i32] containing the differences of -/// both parameters. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sub_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubd((__v2si)__m1, (__v2si)__m2); -} - -/// Subtracts each 8-bit signed integer element of the second 64-bit -/// integer vector of [8 x i8] from the corresponding 8-bit signed integer -/// element of the first 64-bit integer vector of [8 x i8]. Positive results -/// greater than 0x7F are saturated to 0x7F. Negative results less than 0x80 -/// are saturated to 0x80. The results are packed into a 64-bit integer -/// vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBSB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8] containing the subtrahends. -/// \returns A 64-bit integer vector of [8 x i8] containing the saturated -/// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_subs_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubsb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Subtracts each 16-bit signed integer element of the second 64-bit -/// integer vector of [4 x i16] from the corresponding 16-bit signed integer -/// element of the first 64-bit integer vector of [4 x i16]. Positive results -/// greater than 0x7FFF are saturated to 0x7FFF. Negative results less than -/// 0x8000 are saturated to 0x8000. The results are packed into a 64-bit -/// integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBSW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16] containing the subtrahends. -/// \returns A 64-bit integer vector of [4 x i16] containing the saturated -/// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_subs_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubsw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Subtracts each 8-bit unsigned integer element of the second 64-bit -/// integer vector of [8 x i8] from the corresponding 8-bit unsigned integer -/// element of the first 64-bit integer vector of [8 x i8]. -/// -/// If an element of the first vector is less than the corresponding element -/// of the second vector, the result is saturated to 0. The results are -/// packed into a 64-bit integer vector of [8 x i8]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBUSB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8] containing the subtrahends. -/// \returns A 64-bit integer vector of [8 x i8] containing the saturated -/// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_subs_pu8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubusb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Subtracts each 16-bit unsigned integer element of the second 64-bit -/// integer vector of [4 x i16] from the corresponding 16-bit unsigned -/// integer element of the first 64-bit integer vector of [4 x i16]. -/// -/// If an element of the first vector is less than the corresponding element -/// of the second vector, the result is saturated to 0. The results are -/// packed into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSUBUSW instruction. 
-/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16] containing the minuends. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16] containing the subtrahends. -/// \returns A 64-bit integer vector of [4 x i16] containing the saturated -/// differences of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_subs_pu16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_psubusw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Multiplies each 16-bit signed integer element of the first 64-bit -/// integer vector of [4 x i16] by the corresponding 16-bit signed integer -/// element of the second 64-bit integer vector of [4 x i16] and get four -/// 32-bit products. Adds adjacent pairs of products to get two 32-bit sums. -/// The lower 32 bits of these two sums are packed into a 64-bit integer -/// vector of [2 x i32]. -/// -/// For example, bits [15:0] of both parameters are multiplied, bits [31:16] -/// of both parameters are multiplied, and the sum of both results is written -/// to bits [31:0] of the result. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMADDWD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [2 x i32] containing the sums of -/// products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_madd_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pmaddwd((__v4hi)__m1, (__v4hi)__m2); -} - -/// Multiplies each 16-bit signed integer element of the first 64-bit -/// integer vector of [4 x i16] by the corresponding 16-bit signed integer -/// element of the second 64-bit integer vector of [4 x i16]. Packs the upper -/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMULHW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [4 x i16] containing the upper 16 bits -/// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_mulhi_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pmulhw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Multiplies each 16-bit signed integer element of the first 64-bit -/// integer vector of [4 x i16] by the corresponding 16-bit signed integer -/// element of the second 64-bit integer vector of [4 x i16]. Packs the lower -/// 16 bits of the 32-bit products into a 64-bit integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMULLW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [4 x i16] containing the lower 16 bits -/// of the products of both parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_mullo_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pmullw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Left-shifts each 16-bit signed integer element of the first -/// parameter, which is a 64-bit integer vector of [4 x i16], by the number -/// of bits specified by the second parameter, which is a 64-bit integer. The -/// lower 16 bits of the results are packed into a 64-bit integer vector of -/// [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLW instruction. 
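
The pairwise multiply-accumulate documented above (_mm_madd_pi16 / PMADDWD) is the usual building block for short integer dot products. A minimal sketch, illustrative only and assuming x86-64 with MMX:

    #include <mmintrin.h>

    /* Dot product of two 4-element i16 arrays: per-lane products, adjacent
     * pairs summed into two i32 lanes, then the two lanes added at the end. */
    static int dot4_i16(const short a[4], const short b[4])
    {
        __m64 va  = _mm_set_pi16(a[3], a[2], a[1], a[0]);
        __m64 vb  = _mm_set_pi16(b[3], b[2], b[1], b[0]);
        __m64 acc = _mm_madd_pi16(va, vb);       /* [a0*b0+a1*b1, a2*b2+a3*b3] */
        int lo = _mm_cvtsi64_si32(acc);                      /* lower 32-bit sum */
        int hi = _mm_cvtsi64_si32(_mm_srli_si64(acc, 32));   /* upper 32-bit sum */
        _mm_empty();
        return lo + hi;
    }
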
-/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted -/// values. If \a __count is greater or equal to 16, the result is set to all -/// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sll_pi16(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psllw((__v4hi)__m, __count); -} - -/// Left-shifts each 16-bit signed integer element of a 64-bit integer -/// vector of [4 x i16] by the number of bits specified by a 32-bit integer. -/// The lower 16 bits of the results are packed into a 64-bit integer vector -/// of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLW instruction. -/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [4 x i16] containing the left-shifted -/// values. If \a __count is greater or equal to 16, the result is set to all -/// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_slli_pi16(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psllwi((__v4hi)__m, __count); -} - -/// Left-shifts each 32-bit signed integer element of the first -/// parameter, which is a 64-bit integer vector of [2 x i32], by the number -/// of bits specified by the second parameter, which is a 64-bit integer. The -/// lower 32 bits of the results are packed into a 64-bit integer vector of -/// [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted -/// values. If \a __count is greater or equal to 32, the result is set to all -/// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sll_pi32(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_pslld((__v2si)__m, __count); -} - -/// Left-shifts each 32-bit signed integer element of a 64-bit integer -/// vector of [2 x i32] by the number of bits specified by a 32-bit integer. -/// The lower 32 bits of the results are packed into a 64-bit integer vector -/// of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [2 x i32] containing the left-shifted -/// values. If \a __count is greater or equal to 32, the result is set to all -/// 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_slli_pi32(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_pslldi((__v2si)__m, __count); -} - -/// Left-shifts the first 64-bit integer parameter by the number of bits -/// specified by the second 64-bit integer parameter. The lower 64 bits of -/// result are returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLQ instruction. -/// -/// \param __m -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector containing the left-shifted value. If -/// \a __count is greater or equal to 64, the result is set to 0. 
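
The shift intrinsics above come in two forms: the count may be passed as a plain int (_mm_slli_*) or in the low 64 bits of a second __m64 (_mm_sll_*); counts at or beyond the element width zero the result. A minimal sketch, illustrative only, assuming MMX:

    #include <mmintrin.h>

    /* Both forms give the same result here; a count >= 16 clears every lane. */
    static void shift_demo(void)
    {
        __m64 v = _mm_set1_pi16(0x0102);

        __m64 a = _mm_slli_pi16(v, 4);                   /* count as a plain int      */
        __m64 b = _mm_sll_pi16(v, _mm_cvtsi32_si64(4));  /* count in a 64-bit vector  */
        __m64 z = _mm_slli_pi16(v, 16);                  /* out-of-range: all zero    */

        (void)a; (void)b; (void)z;
        _mm_empty();
    }
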
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sll_si64(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psllq((__v1di)__m, __count); -} - -/// Left-shifts the first parameter, which is a 64-bit integer, by the -/// number of bits specified by the second parameter, which is a 32-bit -/// integer. The lower 64 bits of result are returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSLLQ instruction. -/// -/// \param __m -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector containing the left-shifted value. If -/// \a __count is greater or equal to 64, the result is set to 0. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_slli_si64(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psllqi((__v1di)__m, __count); -} - -/// Right-shifts each 16-bit integer element of the first parameter, -/// which is a 64-bit integer vector of [4 x i16], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. -/// -/// High-order bits are filled with the sign bit of the initial value of each -/// 16-bit element. The 16-bit results are packed into a 64-bit integer -/// vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRAW instruction. -/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sra_pi16(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psraw((__v4hi)__m, __count); -} - -/// Right-shifts each 16-bit integer element of a 64-bit integer vector -/// of [4 x i16] by the number of bits specified by a 32-bit integer. -/// -/// High-order bits are filled with the sign bit of the initial value of each -/// 16-bit element. The 16-bit results are packed into a 64-bit integer -/// vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRAW instruction. -/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srai_pi16(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psrawi((__v4hi)__m, __count); -} - -/// Right-shifts each 32-bit integer element of the first parameter, -/// which is a 64-bit integer vector of [2 x i32], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. -/// -/// High-order bits are filled with the sign bit of the initial value of each -/// 32-bit element. The 32-bit results are packed into a 64-bit integer -/// vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRAD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sra_pi32(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psrad((__v2si)__m, __count); -} - -/// Right-shifts each 32-bit integer element of a 64-bit integer vector -/// of [2 x i32] by the number of bits specified by a 32-bit integer. 
-/// -/// High-order bits are filled with the sign bit of the initial value of each -/// 32-bit element. The 32-bit results are packed into a 64-bit integer -/// vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRAD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srai_pi32(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psradi((__v2si)__m, __count); -} - -/// Right-shifts each 16-bit integer element of the first parameter, -/// which is a 64-bit integer vector of [4 x i16], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. -/// -/// High-order bits are cleared. The 16-bit results are packed into a 64-bit -/// integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLW instruction. -/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srl_pi16(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psrlw((__v4hi)__m, __count); -} - -/// Right-shifts each 16-bit integer element of a 64-bit integer vector -/// of [4 x i16] by the number of bits specified by a 32-bit integer. -/// -/// High-order bits are cleared. The 16-bit results are packed into a 64-bit -/// integer vector of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLW instruction. -/// -/// \param __m -/// A 64-bit integer vector of [4 x i16]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [4 x i16] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srli_pi16(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psrlwi((__v4hi)__m, __count); -} - -/// Right-shifts each 32-bit integer element of the first parameter, -/// which is a 64-bit integer vector of [2 x i32], by the number of bits -/// specified by the second parameter, which is a 64-bit integer. -/// -/// High-order bits are cleared. The 32-bit results are packed into a 64-bit -/// integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srl_pi32(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psrld((__v2si)__m, __count); -} - -/// Right-shifts each 32-bit integer element of a 64-bit integer vector -/// of [2 x i32] by the number of bits specified by a 32-bit integer. -/// -/// High-order bits are cleared. The 32-bit results are packed into a 64-bit -/// integer vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLD instruction. -/// -/// \param __m -/// A 64-bit integer vector of [2 x i32]. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector of [2 x i32] containing the right-shifted -/// values. 
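
The difference between the arithmetic (PSRAW, sign-filled) and logical (PSRLW, zero-filled) right shifts described above is easiest to see on a negative lane value. A minimal sketch, illustrative only, assuming MMX:

    #include <mmintrin.h>

    static void rshift_demo(void)
    {
        __m64 v = _mm_set1_pi16((short)0xFF00);   /* -256 in each 16-bit lane */

        __m64 sra = _mm_srai_pi16(v, 4);   /* 0xFFF0 per lane: sign bit propagated */
        __m64 srl = _mm_srli_pi16(v, 4);   /* 0x0FF0 per lane: zero-filled         */

        (void)sra; (void)srl;
        _mm_empty();
    }
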
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srli_pi32(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psrldi((__v2si)__m, __count); -} - -/// Right-shifts the first 64-bit integer parameter by the number of bits -/// specified by the second 64-bit integer parameter. -/// -/// High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLQ instruction. -/// -/// \param __m -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \param __count -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srl_si64(__m64 __m, __m64 __count) -{ - return (__m64)__builtin_ia32_psrlq((__v1di)__m, __count); -} - -/// Right-shifts the first parameter, which is a 64-bit integer, by the -/// number of bits specified by the second parameter, which is a 32-bit -/// integer. -/// -/// High-order bits are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSRLQ instruction. -/// -/// \param __m -/// A 64-bit integer vector interpreted as a single 64-bit integer. -/// \param __count -/// A 32-bit integer value. -/// \returns A 64-bit integer vector containing the right-shifted value. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_srli_si64(__m64 __m, int __count) -{ - return (__m64)__builtin_ia32_psrlqi((__v1di)__m, __count); -} - -/// Performs a bitwise AND of two 64-bit integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PAND instruction. -/// -/// \param __m1 -/// A 64-bit integer vector. -/// \param __m2 -/// A 64-bit integer vector. -/// \returns A 64-bit integer vector containing the bitwise AND of both -/// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_and_si64(__m64 __m1, __m64 __m2) -{ - return __builtin_ia32_pand((__v1di)__m1, (__v1di)__m2); -} - -/// Performs a bitwise NOT of the first 64-bit integer vector, and then -/// performs a bitwise AND of the intermediate result and the second 64-bit -/// integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PANDN instruction. -/// -/// \param __m1 -/// A 64-bit integer vector. The one's complement of this parameter is used -/// in the bitwise AND. -/// \param __m2 -/// A 64-bit integer vector. -/// \returns A 64-bit integer vector containing the bitwise AND of the second -/// parameter and the one's complement of the first parameter. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_andnot_si64(__m64 __m1, __m64 __m2) -{ - return __builtin_ia32_pandn((__v1di)__m1, (__v1di)__m2); -} - -/// Performs a bitwise OR of two 64-bit integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the POR instruction. -/// -/// \param __m1 -/// A 64-bit integer vector. -/// \param __m2 -/// A 64-bit integer vector. -/// \returns A 64-bit integer vector containing the bitwise OR of both -/// parameters. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_or_si64(__m64 __m1, __m64 __m2) -{ - return __builtin_ia32_por((__v1di)__m1, (__v1di)__m2); -} - -/// Performs a bitwise exclusive OR of two 64-bit integer vectors. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PXOR instruction. -/// -/// \param __m1 -/// A 64-bit integer vector. -/// \param __m2 -/// A 64-bit integer vector. -/// \returns A 64-bit integer vector containing the bitwise exclusive OR of both -/// parameters. 
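
The bitwise AND/ANDNOT/OR intrinsics above combine naturally into the classic mask-select idiom, (mask & a) | (~mask & b), with the mask produced by the comparison intrinsics documented just below. A minimal sketch, illustrative only, assuming MMX:

    #include <mmintrin.h>

    /* Per-lane signed 16-bit maximum built from compare + bitwise select. */
    static __m64 max_pi16(__m64 a, __m64 b)
    {
        __m64 mask = _mm_cmpgt_pi16(a, b);            /* 0xFFFF where a > b, else 0 */
        return _mm_or_si64(_mm_and_si64(mask, a),
                           _mm_andnot_si64(mask, b)); /* ~mask & b */
    }
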
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_xor_si64(__m64 __m1, __m64 __m2) -{ - return __builtin_ia32_pxor((__v1di)__m1, (__v1di)__m2); -} - -/// Compares the 8-bit integer elements of two 64-bit integer vectors of -/// [8 x i8] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPEQB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// \returns A 64-bit integer vector of [8 x i8] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpeq_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpeqb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Compares the 16-bit integer elements of two 64-bit integer vectors of -/// [4 x i16] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPEQW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. -/// \returns A 64-bit integer vector of [4 x i16] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpeq_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpeqw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Compares the 32-bit integer elements of two 64-bit integer vectors of -/// [2 x i32] to determine if the element of the first vector is equal to the -/// corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPEQD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. -/// \returns A 64-bit integer vector of [2 x i32] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpeq_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpeqd((__v2si)__m1, (__v2si)__m2); -} - -/// Compares the 8-bit integer elements of two 64-bit integer vectors of -/// [8 x i8] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPGTB instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [8 x i8]. -/// \param __m2 -/// A 64-bit integer vector of [8 x i8]. -/// \returns A 64-bit integer vector of [8 x i8] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpgt_pi8(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpgtb((__v8qi)__m1, (__v8qi)__m2); -} - -/// Compares the 16-bit integer elements of two 64-bit integer vectors of -/// [4 x i16] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPGTW instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [4 x i16]. -/// \param __m2 -/// A 64-bit integer vector of [4 x i16]. 
-/// \returns A 64-bit integer vector of [4 x i16] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpgt_pi16(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpgtw((__v4hi)__m1, (__v4hi)__m2); -} - -/// Compares the 32-bit integer elements of two 64-bit integer vectors of -/// [2 x i32] to determine if the element of the first vector is greater than -/// the corresponding element of the second vector. -/// -/// The comparison yields 0 for false, 0xFFFFFFFF for true. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PCMPGTD instruction. -/// -/// \param __m1 -/// A 64-bit integer vector of [2 x i32]. -/// \param __m2 -/// A 64-bit integer vector of [2 x i32]. -/// \returns A 64-bit integer vector of [2 x i32] containing the comparison -/// results. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_cmpgt_pi32(__m64 __m1, __m64 __m2) -{ - return (__m64)__builtin_ia32_pcmpgtd((__v2si)__m1, (__v2si)__m2); -} - -/// Constructs a 64-bit integer vector initialized to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PXOR instruction. -/// -/// \returns An initialized 64-bit integer vector with all elements set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setzero_si64(void) -{ - return __extension__ (__m64){ 0LL }; -} - -/// Constructs a 64-bit integer vector initialized with the specified -/// 32-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i1 -/// A 32-bit integer value used to initialize the upper 32 bits of the -/// result. -/// \param __i0 -/// A 32-bit integer value used to initialize the lower 32 bits of the -/// result. -/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set_pi32(int __i1, int __i0) -{ - return (__m64)__builtin_ia32_vec_init_v2si(__i0, __i1); -} - -/// Constructs a 64-bit integer vector initialized with the specified -/// 16-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __s3 -/// A 16-bit integer value used to initialize bits [63:48] of the result. -/// \param __s2 -/// A 16-bit integer value used to initialize bits [47:32] of the result. -/// \param __s1 -/// A 16-bit integer value used to initialize bits [31:16] of the result. -/// \param __s0 -/// A 16-bit integer value used to initialize bits [15:0] of the result. -/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set_pi16(short __s3, short __s2, short __s1, short __s0) -{ - return (__m64)__builtin_ia32_vec_init_v4hi(__s0, __s1, __s2, __s3); -} - -/// Constructs a 64-bit integer vector initialized with the specified -/// 8-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b7 -/// An 8-bit integer value used to initialize bits [63:56] of the result. -/// \param __b6 -/// An 8-bit integer value used to initialize bits [55:48] of the result. -/// \param __b5 -/// An 8-bit integer value used to initialize bits [47:40] of the result. -/// \param __b4 -/// An 8-bit integer value used to initialize bits [39:32] of the result. -/// \param __b3 -/// An 8-bit integer value used to initialize bits [31:24] of the result. 
-/// \param __b2 -/// An 8-bit integer value used to initialize bits [23:16] of the result. -/// \param __b1 -/// An 8-bit integer value used to initialize bits [15:8] of the result. -/// \param __b0 -/// An 8-bit integer value used to initialize bits [7:0] of the result. -/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set_pi8(char __b7, char __b6, char __b5, char __b4, char __b3, char __b2, - char __b1, char __b0) -{ - return (__m64)__builtin_ia32_vec_init_v8qi(__b0, __b1, __b2, __b3, - __b4, __b5, __b6, __b7); -} - -/// Constructs a 64-bit integer vector of [2 x i32], with each of the -/// 32-bit integer vector elements set to the specified 32-bit integer -/// value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i -/// A 32-bit integer value used to initialize each vector element of the -/// result. -/// \returns An initialized 64-bit integer vector of [2 x i32]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi32(int __i) -{ - return _mm_set_pi32(__i, __i); -} - -/// Constructs a 64-bit integer vector of [4 x i16], with each of the -/// 16-bit integer vector elements set to the specified 16-bit integer -/// value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w -/// A 16-bit integer value used to initialize each vector element of the -/// result. -/// \returns An initialized 64-bit integer vector of [4 x i16]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi16(short __w) -{ - return _mm_set_pi16(__w, __w, __w, __w); -} - -/// Constructs a 64-bit integer vector of [8 x i8], with each of the -/// 8-bit integer vector elements set to the specified 8-bit integer value. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b -/// An 8-bit integer value used to initialize each vector element of the -/// result. -/// \returns An initialized 64-bit integer vector of [8 x i8]. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_set1_pi8(char __b) -{ - return _mm_set_pi8(__b, __b, __b, __b, __b, __b, __b, __b); -} - -/// Constructs a 64-bit integer vector, initialized in reverse order with -/// the specified 32-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __i0 -/// A 32-bit integer value used to initialize the lower 32 bits of the -/// result. -/// \param __i1 -/// A 32-bit integer value used to initialize the upper 32 bits of the -/// result. -/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setr_pi32(int __i0, int __i1) -{ - return _mm_set_pi32(__i1, __i0); -} - -/// Constructs a 64-bit integer vector, initialized in reverse order with -/// the specified 16-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __w0 -/// A 16-bit integer value used to initialize bits [15:0] of the result. -/// \param __w1 -/// A 16-bit integer value used to initialize bits [31:16] of the result. -/// \param __w2 -/// A 16-bit integer value used to initialize bits [47:32] of the result. -/// \param __w3 -/// A 16-bit integer value used to initialize bits [63:48] of the result. 
-/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setr_pi16(short __w0, short __w1, short __w2, short __w3) -{ - return _mm_set_pi16(__w3, __w2, __w1, __w0); -} - -/// Constructs a 64-bit integer vector, initialized in reverse order with -/// the specified 8-bit integer values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __b0 -/// An 8-bit integer value used to initialize bits [7:0] of the result. -/// \param __b1 -/// An 8-bit integer value used to initialize bits [15:8] of the result. -/// \param __b2 -/// An 8-bit integer value used to initialize bits [23:16] of the result. -/// \param __b3 -/// An 8-bit integer value used to initialize bits [31:24] of the result. -/// \param __b4 -/// An 8-bit integer value used to initialize bits [39:32] of the result. -/// \param __b5 -/// An 8-bit integer value used to initialize bits [47:40] of the result. -/// \param __b6 -/// An 8-bit integer value used to initialize bits [55:48] of the result. -/// \param __b7 -/// An 8-bit integer value used to initialize bits [63:56] of the result. -/// \returns An initialized 64-bit integer vector. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_setr_pi8(char __b0, char __b1, char __b2, char __b3, char __b4, char __b5, - char __b6, char __b7) -{ - return _mm_set_pi8(__b7, __b6, __b5, __b4, __b3, __b2, __b1, __b0); -} - -#undef __DEFAULT_FN_ATTRS - -/* Aliases for compatibility. */ -#define _m_empty _mm_empty -#define _m_from_int _mm_cvtsi32_si64 -#define _m_from_int64 _mm_cvtsi64_m64 -#define _m_to_int _mm_cvtsi64_si32 -#define _m_to_int64 _mm_cvtm64_si64 -#define _m_packsswb _mm_packs_pi16 -#define _m_packssdw _mm_packs_pi32 -#define _m_packuswb _mm_packs_pu16 -#define _m_punpckhbw _mm_unpackhi_pi8 -#define _m_punpckhwd _mm_unpackhi_pi16 -#define _m_punpckhdq _mm_unpackhi_pi32 -#define _m_punpcklbw _mm_unpacklo_pi8 -#define _m_punpcklwd _mm_unpacklo_pi16 -#define _m_punpckldq _mm_unpacklo_pi32 -#define _m_paddb _mm_add_pi8 -#define _m_paddw _mm_add_pi16 -#define _m_paddd _mm_add_pi32 -#define _m_paddsb _mm_adds_pi8 -#define _m_paddsw _mm_adds_pi16 -#define _m_paddusb _mm_adds_pu8 -#define _m_paddusw _mm_adds_pu16 -#define _m_psubb _mm_sub_pi8 -#define _m_psubw _mm_sub_pi16 -#define _m_psubd _mm_sub_pi32 -#define _m_psubsb _mm_subs_pi8 -#define _m_psubsw _mm_subs_pi16 -#define _m_psubusb _mm_subs_pu8 -#define _m_psubusw _mm_subs_pu16 -#define _m_pmaddwd _mm_madd_pi16 -#define _m_pmulhw _mm_mulhi_pi16 -#define _m_pmullw _mm_mullo_pi16 -#define _m_psllw _mm_sll_pi16 -#define _m_psllwi _mm_slli_pi16 -#define _m_pslld _mm_sll_pi32 -#define _m_pslldi _mm_slli_pi32 -#define _m_psllq _mm_sll_si64 -#define _m_psllqi _mm_slli_si64 -#define _m_psraw _mm_sra_pi16 -#define _m_psrawi _mm_srai_pi16 -#define _m_psrad _mm_sra_pi32 -#define _m_psradi _mm_srai_pi32 -#define _m_psrlw _mm_srl_pi16 -#define _m_psrlwi _mm_srli_pi16 -#define _m_psrld _mm_srl_pi32 -#define _m_psrldi _mm_srli_pi32 -#define _m_psrlq _mm_srl_si64 -#define _m_psrlqi _mm_srli_si64 -#define _m_pand _mm_and_si64 -#define _m_pandn _mm_andnot_si64 -#define _m_por _mm_or_si64 -#define _m_pxor _mm_xor_si64 -#define _m_pcmpeqb _mm_cmpeq_pi8 -#define _m_pcmpeqw _mm_cmpeq_pi16 -#define _m_pcmpeqd _mm_cmpeq_pi32 -#define _m_pcmpgtb _mm_cmpgt_pi8 -#define _m_pcmpgtw _mm_cmpgt_pi16 -#define _m_pcmpgtd _mm_cmpgt_pi32 - -#endif /* __MMINTRIN_H */ - diff --git a/include/movdirintrin.h b/include/movdirintrin.h deleted file 
mode 100644 index 30c4d02..0000000 --- a/include/movdirintrin.h +++ /dev/null @@ -1,49 +0,0 @@ -/*===------------------------- movdirintrin.h ------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef _MOVDIRINTRIN_H -#define _MOVDIRINTRIN_H - -/* Move doubleword as direct store */ -static __inline__ void -__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) -_directstoreu_u32 (void *__dst, unsigned int __value) -{ - __builtin_ia32_directstore_u32((unsigned int *)__dst, (unsigned int)__value); -} - -#ifdef __x86_64__ - -/* Move quadword as direct store */ -static __inline__ void -__attribute__((__always_inline__, __nodebug__, __target__("movdiri"))) -_directstoreu_u64 (void *__dst, unsigned long __value) -{ - __builtin_ia32_directstore_u64((unsigned long *)__dst, __value); -} - -#endif /* __x86_64__ */ - -/* - * movdir64b - Move 64 bytes as direct store. - * The destination must be 64 byte aligned, and the store is atomic. - * The source address has no alignment requirement, and the load from - * the source address is not atomic. - */ -static __inline__ void -__attribute__((__always_inline__, __nodebug__, __target__("movdir64b"))) -_movdir64b (void *__dst __attribute__((align_value(64))), const void *__src) -{ - __builtin_ia32_movdir64b(__dst, __src); -} - -#endif /* _MOVDIRINTRIN_H */ diff --git a/include/mwaitxintrin.h b/include/mwaitxintrin.h deleted file mode 100644 index ed48538..0000000 --- a/include/mwaitxintrin.h +++ /dev/null @@ -1,33 +0,0 @@ -/*===---- mwaitxintrin.h - MONITORX/MWAITX intrinsics ----------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __MWAITXINTRIN_H -#define __MWAITXINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("mwaitx"))) -static __inline__ void __DEFAULT_FN_ATTRS -_mm_monitorx(void * __p, unsigned __extensions, unsigned __hints) -{ - __builtin_ia32_monitorx(__p, __extensions, __hints); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_mm_mwaitx(unsigned __extensions, unsigned __hints, unsigned __clock) -{ - __builtin_ia32_mwaitx(__extensions, __hints, __clock); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __MWAITXINTRIN_H */ diff --git a/include/nmmintrin.h b/include/nmmintrin.h deleted file mode 100644 index 59fc7ec..0000000 --- a/include/nmmintrin.h +++ /dev/null @@ -1,20 +0,0 @@ -/*===---- nmmintrin.h - SSE4 intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __NMMINTRIN_H -#define __NMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -/* To match expectations of gcc we put the sse4.2 definitions into smmintrin.h, - just include it now then. */ -#include -#endif /* __NMMINTRIN_H */ diff --git a/include/pconfigintrin.h b/include/pconfigintrin.h deleted file mode 100644 index d2014b0..0000000 --- a/include/pconfigintrin.h +++ /dev/null @@ -1,40 +0,0 @@ -/*===---- pconfigintrin.h - X86 platform configuration ---------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __PCONFIGINTRIN_H -#define __PCONFIGINTRIN_H - -#define __PCONFIG_KEY_PROGRAM 0x00000001 - -#if __has_extension(gnu_asm) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("pconfig"))) - -static __inline unsigned int __DEFAULT_FN_ATTRS -_pconfig_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) -{ - unsigned int __result; - __asm__ ("pconfig" - : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) - : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) - : "cc"); - return __result; -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __has_extension(gnu_asm) */ - -#endif diff --git a/include/pkuintrin.h b/include/pkuintrin.h deleted file mode 100644 index c62080b..0000000 --- a/include/pkuintrin.h +++ /dev/null @@ -1,34 +0,0 @@ -/*===---- pkuintrin.h - PKU intrinsics -------------------------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __PKUINTRIN_H -#define __PKUINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("pku"))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_rdpkru_u32(void) -{ - return __builtin_ia32_rdpkru(); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_wrpkru(unsigned int __val) -{ - __builtin_ia32_wrpkru(__val); -} - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/pmmintrin.h b/include/pmmintrin.h deleted file mode 100644 index eda8356..0000000 --- a/include/pmmintrin.h +++ /dev/null @@ -1,294 +0,0 @@ -/*===---- pmmintrin.h - SSE3 intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __PMMINTRIN_H -#define __PMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("sse3"), __min_vector_width__(128))) - -/// Loads data from an unaligned memory location to elements in a 128-bit -/// vector. -/// -/// If the address of the data is not 16-byte aligned, the instruction may -/// read two adjacent aligned blocks of memory to retrieve the requested -/// data. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VLDDQU instruction. -/// -/// \param __p -/// A pointer to a 128-bit integer vector containing integer values. -/// \returns A 128-bit vector containing the moved values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_lddqu_si128(__m128i const *__p) -{ - return (__m128i)__builtin_ia32_lddqu((char const *)__p); -} - -/// Adds the even-indexed values and subtracts the odd-indexed values of -/// two 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSUBPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the left source operand. -/// \param __b -/// A 128-bit vector of [4 x float] containing the right source operand. -/// \returns A 128-bit vector of [4 x float] containing the alternating sums and -/// differences of both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_addsub_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_addsubps((__v4sf)__a, (__v4sf)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in two -/// 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHADDPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 128-bit vector of [4 x float] containing the horizontal sums of -/// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hadd_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_haddps((__v4sf)__a, (__v4sf)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in two -/// 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHSUBPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The horizontal differences between the values are stored in the lower -/// bits of the destination. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The horizontal differences between the values are stored in the upper -/// bits of the destination. -/// \returns A 128-bit vector of [4 x float] containing the horizontal -/// differences of both operands. 
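
The horizontal-add intrinsic documented above (_mm_hadd_ps / HADDPS) is commonly applied twice to reduce a vector to a single sum. A minimal sketch, illustrative only, assuming an SSE3-capable target (e.g. clang -msse3):

    #include <pmmintrin.h>

    /* Horizontal sum of a 4-float vector; after two HADDPS steps every lane
     * holds the total, so lane 0 is returned. */
    static float hsum_ps(__m128 v)
    {
        __m128 t = _mm_hadd_ps(v, v);   /* [v0+v1, v2+v3, v0+v1, v2+v3] */
        t = _mm_hadd_ps(t, t);          /* [sum, sum, sum, sum]         */
        return _mm_cvtss_f32(t);
    }
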
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_hsub_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_hsubps((__v4sf)__a, (__v4sf)__b); -} - -/// Moves and duplicates odd-indexed values from a 128-bit vector -/// of [4 x float] to float values stored in a 128-bit vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSHDUP instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. \n -/// Bits [127:96] of the source are written to bits [127:96] and [95:64] of -/// the destination. \n -/// Bits [63:32] of the source are written to bits [63:32] and [31:0] of the -/// destination. -/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated -/// values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_movehdup_ps(__m128 __a) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 1, 1, 3, 3); -} - -/// Duplicates even-indexed values from a 128-bit vector of -/// [4 x float] to float values stored in a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSLDUP instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float] \n -/// Bits [95:64] of the source are written to bits [127:96] and [95:64] of -/// the destination. \n -/// Bits [31:0] of the source are written to bits [63:32] and [31:0] of the -/// destination. -/// \returns A 128-bit vector of [4 x float] containing the moved and duplicated -/// values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_moveldup_ps(__m128 __a) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 2, 2); -} - -/// Adds the even-indexed values and subtracts the odd-indexed values of -/// two 128-bit vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSUBPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing the left source operand. -/// \param __b -/// A 128-bit vector of [2 x double] containing the right source operand. -/// \returns A 128-bit vector of [2 x double] containing the alternating sums -/// and differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_addsub_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_addsubpd((__v2df)__a, (__v2df)__b); -} - -/// Horizontally adds the pairs of values contained in two 128-bit -/// vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHADDPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// The horizontal sum of the values is stored in the lower bits of the -/// destination. -/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// The horizontal sum of the values is stored in the upper bits of the -/// destination. -/// \returns A 128-bit vector of [2 x double] containing the horizontal sums of -/// both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hadd_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_haddpd((__v2df)__a, (__v2df)__b); -} - -/// Horizontally subtracts the pairs of values contained in two 128-bit -/// vectors of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VHSUBPD instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// The horizontal difference of the values is stored in the lower bits of -/// the destination. 
-/// \param __b -/// A 128-bit vector of [2 x double] containing one of the source operands. -/// The horizontal difference of the values is stored in the upper bits of -/// the destination. -/// \returns A 128-bit vector of [2 x double] containing the horizontal -/// differences of both operands. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_hsub_pd(__m128d __a, __m128d __b) -{ - return __builtin_ia32_hsubpd((__v2df)__a, (__v2df)__b); -} - -/// Moves and duplicates one double-precision value to double-precision -/// values stored in a 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_loaddup_pd(double const *dp); -/// \endcode -/// -/// This intrinsic corresponds to the VMOVDDUP instruction. -/// -/// \param dp -/// A pointer to a double-precision value to be moved and duplicated. -/// \returns A 128-bit vector of [2 x double] containing the moved and -/// duplicated values. -#define _mm_loaddup_pd(dp) _mm_load1_pd(dp) - -/// Moves and duplicates the double-precision value in the lower bits of -/// a 128-bit vector of [2 x double] to double-precision values stored in a -/// 128-bit vector of [2 x double]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVDDUP instruction. -/// -/// \param __a -/// A 128-bit vector of [2 x double]. Bits [63:0] are written to bits -/// [127:64] and [63:0] of the destination. -/// \returns A 128-bit vector of [2 x double] containing the moved and -/// duplicated values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_movedup_pd(__m128d __a) -{ - return __builtin_shufflevector((__v2df)__a, (__v2df)__a, 0, 0); -} - -/// Establishes a linear address memory range to be monitored and puts -/// the processor in the monitor event pending state. Data stored in the -/// monitored address range causes the processor to exit the pending state. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MONITOR instruction. -/// -/// \param __p -/// The memory range to be monitored. The size of the range is determined by -/// CPUID function 0000_0005h. -/// \param __extensions -/// Optional extensions for the monitoring state. -/// \param __hints -/// Optional hints for the monitoring state. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_monitor(void const *__p, unsigned __extensions, unsigned __hints) -{ - __builtin_ia32_monitor(__p, __extensions, __hints); -} - -/// Used with the MONITOR instruction to wait while the processor is in -/// the monitor event pending state. Data stored in the monitored address -/// range causes the processor to exit the pending state. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MWAIT instruction. -/// -/// \param __extensions -/// Optional extensions for the monitoring state, which may vary by -/// processor. -/// \param __hints -/// Optional hints for the monitoring state, which may vary by processor. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_mwait(unsigned __extensions, unsigned __hints) -{ - __builtin_ia32_mwait(__extensions, __hints); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __PMMINTRIN_H */ diff --git a/include/popcntintrin.h b/include/popcntintrin.h deleted file mode 100644 index 0aa94ae..0000000 --- a/include/popcntintrin.h +++ /dev/null @@ -1,59 +0,0 @@ -/*===---- popcntintrin.h - POPCNT intrinsics -------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __POPCNTINTRIN_H -#define __POPCNTINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("popcnt"))) - -#if defined(__cplusplus) && (__cplusplus >= 201103L) -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr -#else -#define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS -#endif - -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// An unsigned 32-bit integer operand. -/// \returns A 32-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_popcnt_u32(unsigned int __A) -{ - return __builtin_popcount(__A); -} - -#ifdef __x86_64__ -/// Counts the number of bits in the source operand having a value of 1. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the POPCNT instruction. -/// -/// \param __A -/// An unsigned 64-bit integer operand. -/// \returns A 64-bit integer containing the number of bits with value 1 in the -/// source operand. -static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR -_mm_popcnt_u64(unsigned long long __A) -{ - return __builtin_popcountll(__A); -} -#endif /* __x86_64__ */ - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_CONSTEXPR - -#endif /* __POPCNTINTRIN_H */ diff --git a/include/prfchwintrin.h b/include/prfchwintrin.h deleted file mode 100644 index d2f91aa..0000000 --- a/include/prfchwintrin.h +++ /dev/null @@ -1,58 +0,0 @@ -/*===---- prfchwintrin.h - PREFETCHW intrinsic -----------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined(__X86INTRIN_H) && !defined(_MM3DNOW_H_INCLUDED) -#error "Never use directly; include or instead." -#endif - -#ifndef __PRFCHWINTRIN_H -#define __PRFCHWINTRIN_H - -/// Loads a memory sequence containing the specified memory address into -/// all data cache levels. The cache-coherency state is set to exclusive. -/// Data can be read from and written to the cache line without additional -/// delay. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PREFETCHT0 instruction. -/// -/// \param __P -/// A pointer specifying the memory address to be prefetched. -static __inline__ void __attribute__((__always_inline__, __nodebug__)) -_m_prefetch(void *__P) -{ - __builtin_prefetch (__P, 0, 3 /* _MM_HINT_T0 */); -} - -/// Loads a memory sequence containing the specified memory address into -/// the L1 data cache and sets the cache-coherency to modified. This -/// provides a hint to the processor that the cache line will be modified. -/// It is intended for use when the cache line will be written to shortly -/// after the prefetch is performed. -/// -/// Note that the effect of this intrinsic is dependent on the processor -/// implementation. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PREFETCHW instruction. -/// -/// \param __P -/// A pointer specifying the memory address to be prefetched. 
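/* Editor's note: an illustrative usage sketch for the popcntintrin.h
 * intrinsics removed above, not part of the removed header. It assumes the
 * popcnt target feature is enabled (e.g. -mpopcnt); example_count_bits()
 * is a hypothetical helper. */
#include <popcntintrin.h>
#include <stddef.h>

static inline unsigned int example_count_bits(const unsigned int *words, size_t n)
{
    unsigned int total = 0;
    for (size_t i = 0; i < n; i++)
        total += (unsigned int)_mm_popcnt_u32(words[i]);  /* POPCNT per word */
    return total;
}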
-static __inline__ void __attribute__((__always_inline__, __nodebug__)) -_m_prefetchw(volatile const void *__P) -{ -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wcast-qual" - __builtin_prefetch ((const void*)__P, 1, 3 /* _MM_HINT_T0 */); -#pragma clang diagnostic pop -} - -#endif /* __PRFCHWINTRIN_H */ diff --git a/include/ptwriteintrin.h b/include/ptwriteintrin.h deleted file mode 100644 index 0a04f7c..0000000 --- a/include/ptwriteintrin.h +++ /dev/null @@ -1,37 +0,0 @@ -/*===------------ ptwriteintrin.h - PTWRITE intrinsic --------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __PTWRITEINTRIN_H -#define __PTWRITEINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("ptwrite"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_ptwrite32(unsigned int __value) { - __builtin_ia32_ptwrite32(__value); -} - -#ifdef __x86_64__ - -static __inline__ void __DEFAULT_FN_ATTRS -_ptwrite64(unsigned long long __value) { - __builtin_ia32_ptwrite64(__value); -} - -#endif /* __x86_64__ */ - -#undef __DEFAULT_FN_ATTRS - -#endif /* __PTWRITEINTRIN_H */ diff --git a/include/rdseedintrin.h b/include/rdseedintrin.h deleted file mode 100644 index ccb3d2d..0000000 --- a/include/rdseedintrin.h +++ /dev/null @@ -1,42 +0,0 @@ -/*===---- rdseedintrin.h - RDSEED intrinsics -------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __RDSEEDINTRIN_H -#define __RDSEEDINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rdseed"))) - -static __inline__ int __DEFAULT_FN_ATTRS -_rdseed16_step(unsigned short *__p) -{ - return __builtin_ia32_rdseed16_step(__p); -} - -static __inline__ int __DEFAULT_FN_ATTRS -_rdseed32_step(unsigned int *__p) -{ - return __builtin_ia32_rdseed32_step(__p); -} - -#ifdef __x86_64__ -static __inline__ int __DEFAULT_FN_ATTRS -_rdseed64_step(unsigned long long *__p) -{ - return __builtin_ia32_rdseed64_step(__p); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* __RDSEEDINTRIN_H */ diff --git a/include/rtmintrin.h b/include/rtmintrin.h deleted file mode 100644 index 36ff583..0000000 --- a/include/rtmintrin.h +++ /dev/null @@ -1,45 +0,0 @@ -/*===---- rtmintrin.h - RTM intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __RTMINTRIN_H -#define __RTMINTRIN_H - -#define _XBEGIN_STARTED (~0u) -#define _XABORT_EXPLICIT (1 << 0) -#define _XABORT_RETRY (1 << 1) -#define _XABORT_CONFLICT (1 << 2) -#define _XABORT_CAPACITY (1 << 3) -#define _XABORT_DEBUG (1 << 4) -#define _XABORT_NESTED (1 << 5) -#define _XABORT_CODE(x) (((x) >> 24) & 0xFF) - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("rtm"))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -_xbegin(void) -{ - return __builtin_ia32_xbegin(); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_xend(void) -{ - __builtin_ia32_xend(); -} - -#define _xabort(imm) __builtin_ia32_xabort((imm)) - -#undef __DEFAULT_FN_ATTRS - -#endif /* __RTMINTRIN_H */ diff --git a/include/serializeintrin.h b/include/serializeintrin.h deleted file mode 100644 index b774e5a..0000000 --- a/include/serializeintrin.h +++ /dev/null @@ -1,30 +0,0 @@ -/*===--------------- serializeintrin.h - serialize intrinsics --------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __SERIALIZEINTRIN_H -#define __SERIALIZEINTRIN_H - -/// Serialize instruction fetch and execution. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the SERIALIZE instruction. -/// -static __inline__ void -__attribute__((__always_inline__, __nodebug__, __target__("serialize"))) -_serialize (void) -{ - __builtin_ia32_serialize (); -} - -#endif /* __SERIALIZEINTRIN_H */ diff --git a/include/sgxintrin.h b/include/sgxintrin.h deleted file mode 100644 index 303a21f..0000000 --- a/include/sgxintrin.h +++ /dev/null @@ -1,60 +0,0 @@ -/*===---- sgxintrin.h - X86 SGX intrinsics configuration -------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __SGXINTRIN_H -#define __SGXINTRIN_H - -#if __has_extension(gnu_asm) - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("sgx"))) - -static __inline unsigned int __DEFAULT_FN_ATTRS -_enclu_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) -{ - unsigned int __result; - __asm__ ("enclu" - : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) - : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) - : "cc"); - return __result; -} - -static __inline unsigned int __DEFAULT_FN_ATTRS -_encls_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) -{ - unsigned int __result; - __asm__ ("encls" - : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) - : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) - : "cc"); - return __result; -} - -static __inline unsigned int __DEFAULT_FN_ATTRS -_enclv_u32(unsigned int __leaf, __SIZE_TYPE__ __d[]) -{ - unsigned int __result; - __asm__ ("enclv" - : "=a" (__result), "=b" (__d[0]), "=c" (__d[1]), "=d" (__d[2]) - : "a" (__leaf), "b" (__d[0]), "c" (__d[1]), "d" (__d[2]) - : "cc"); - return __result; -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __has_extension(gnu_asm) */ - -#endif diff --git a/include/shaintrin.h b/include/shaintrin.h deleted file mode 100644 index 08b1fb1..0000000 --- a/include/shaintrin.h +++ /dev/null @@ -1,61 +0,0 @@ -/*===---- shaintrin.h - SHA intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __SHAINTRIN_H -#define __SHAINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sha"), __min_vector_width__(128))) - -#define _mm_sha1rnds4_epu32(V1, V2, M) \ - __builtin_ia32_sha1rnds4((__v4si)(__m128i)(V1), (__v4si)(__m128i)(V2), (M)) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha1nexte_epu32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_sha1nexte((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha1msg1_epu32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_sha1msg1((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha1msg2_epu32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_sha1msg2((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha256rnds2_epu32(__m128i __X, __m128i __Y, __m128i __Z) -{ - return (__m128i)__builtin_ia32_sha256rnds2((__v4si)__X, (__v4si)__Y, (__v4si)__Z); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha256msg1_epu32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_sha256msg1((__v4si)__X, (__v4si)__Y); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha256msg2_epu32(__m128i __X, __m128i __Y) -{ - return (__m128i)__builtin_ia32_sha256msg2((__v4si)__X, (__v4si)__Y); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __SHAINTRIN_H */ diff --git a/include/smmintrin.h b/include/smmintrin.h deleted file mode 100644 index aff83ee..0000000 --- a/include/smmintrin.h +++ /dev/null @@ -1,2383 +0,0 @@ -/*===---- smmintrin.h - SSE4 intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
- * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __SMMINTRIN_H -#define __SMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.1"), __min_vector_width__(128))) - -/* SSE4 Rounding macros. */ -#define _MM_FROUND_TO_NEAREST_INT 0x00 -#define _MM_FROUND_TO_NEG_INF 0x01 -#define _MM_FROUND_TO_POS_INF 0x02 -#define _MM_FROUND_TO_ZERO 0x03 -#define _MM_FROUND_CUR_DIRECTION 0x04 - -#define _MM_FROUND_RAISE_EXC 0x00 -#define _MM_FROUND_NO_EXC 0x08 - -#define _MM_FROUND_NINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEAREST_INT) -#define _MM_FROUND_FLOOR (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_NEG_INF) -#define _MM_FROUND_CEIL (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_POS_INF) -#define _MM_FROUND_TRUNC (_MM_FROUND_RAISE_EXC | _MM_FROUND_TO_ZERO) -#define _MM_FROUND_RINT (_MM_FROUND_RAISE_EXC | _MM_FROUND_CUR_DIRECTION) -#define _MM_FROUND_NEARBYINT (_MM_FROUND_NO_EXC | _MM_FROUND_CUR_DIRECTION) - -/// Rounds up each element of the 128-bit vector of [4 x float] to an -/// integer and returns the rounded values in a 128-bit vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_ceil_ps(__m128 X); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float] values to be rounded up. -/// \returns A 128-bit vector of [4 x float] containing the rounded values. -#define _mm_ceil_ps(X) _mm_round_ps((X), _MM_FROUND_CEIL) - -/// Rounds up each element of the 128-bit vector of [2 x double] to an -/// integer and returns the rounded values in a 128-bit vector of -/// [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_ceil_pd(__m128d X); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double] values to be rounded up. -/// \returns A 128-bit vector of [2 x double] containing the rounded values. -#define _mm_ceil_pd(X) _mm_round_pd((X), _MM_FROUND_CEIL) - -/// Copies three upper elements of the first 128-bit vector operand to -/// the corresponding three upper elements of the 128-bit result vector of -/// [4 x float]. Rounds up the lowest element of the second 128-bit vector -/// operand to an integer and copies it to the lowest element of the 128-bit -/// result vector of [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_ceil_ss(__m128 X, __m128 Y); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is -/// rounded up to the nearest integer and copied to the corresponding bits -/// of the result. -/// \returns A 128-bit vector of [4 x float] containing the copied and rounded -/// values. 
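/* Editor's note: an illustrative usage sketch, not part of the removed
 * header. The SSE4.1 ceiling/floor macros are thin wrappers around
 * _mm_round_ps()/_mm_round_pd() with a fixed rounding-control immediate;
 * assuming -msse4.1, example_ceil() (a hypothetical helper) is equivalent
 * to _mm_ceil_ps(). */
#include <smmintrin.h>

static inline __m128 example_ceil(__m128 v)
{
    /* Round every lane toward positive infinity. */
    return _mm_round_ps(v, _MM_FROUND_CEIL);
}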
-#define _mm_ceil_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_CEIL) - -/// Copies the upper element of the first 128-bit vector operand to the -/// corresponding upper element of the 128-bit result vector of [2 x double]. -/// Rounds up the lower element of the second 128-bit vector operand to an -/// integer and copies it to the lower element of the 128-bit result vector -/// of [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_ceil_sd(__m128d X, __m128d Y); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is -/// rounded up to the nearest integer and copied to the corresponding bits -/// of the result. -/// \returns A 128-bit vector of [2 x double] containing the copied and rounded -/// values. -#define _mm_ceil_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_CEIL) - -/// Rounds down each element of the 128-bit vector of [4 x float] to an -/// an integer and returns the rounded values in a 128-bit vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_floor_ps(__m128 X); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float] values to be rounded down. -/// \returns A 128-bit vector of [4 x float] containing the rounded values. -#define _mm_floor_ps(X) _mm_round_ps((X), _MM_FROUND_FLOOR) - -/// Rounds down each element of the 128-bit vector of [2 x double] to an -/// integer and returns the rounded values in a 128-bit vector of -/// [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_floor_pd(__m128d X); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. -/// \returns A 128-bit vector of [2 x double] containing the rounded values. -#define _mm_floor_pd(X) _mm_round_pd((X), _MM_FROUND_FLOOR) - -/// Copies three upper elements of the first 128-bit vector operand to -/// the corresponding three upper elements of the 128-bit result vector of -/// [4 x float]. Rounds down the lowest element of the second 128-bit vector -/// operand to an integer and copies it to the lowest element of the 128-bit -/// result vector of [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_floor_ss(__m128 X, __m128 Y); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is -/// rounded down to the nearest integer and copied to the corresponding bits -/// of the result. -/// \returns A 128-bit vector of [4 x float] containing the copied and rounded -/// values. -#define _mm_floor_ss(X, Y) _mm_round_ss((X), (Y), _MM_FROUND_FLOOR) - -/// Copies the upper element of the first 128-bit vector operand to the -/// corresponding upper element of the 128-bit result vector of [2 x double]. -/// Rounds down the lower element of the second 128-bit vector operand to an -/// integer and copies it to the lower element of the 128-bit result vector -/// of [2 x double]. 
-/// -/// \headerfile -/// -/// \code -/// __m128d _mm_floor_sd(__m128d X, __m128d Y); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is -/// rounded down to the nearest integer and copied to the corresponding bits -/// of the result. -/// \returns A 128-bit vector of [2 x double] containing the copied and rounded -/// values. -#define _mm_floor_sd(X, Y) _mm_round_sd((X), (Y), _MM_FROUND_FLOOR) - -/// Rounds each element of the 128-bit vector of [4 x float] to an -/// integer value according to the rounding control specified by the second -/// argument and returns the rounded values in a 128-bit vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_round_ps(__m128 X, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPS / ROUNDPS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. -/// \param M -/// An integer value that specifies the rounding operation. \n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used \n -/// 1: The PE field is not updated \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M \n -/// 1: Use the current MXCSR setting \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest \n -/// 01: Downward (toward negative infinity) \n -/// 10: Upward (toward positive infinity) \n -/// 11: Truncated -/// \returns A 128-bit vector of [4 x float] containing the rounded values. -#define _mm_round_ps(X, M) \ - ((__m128)__builtin_ia32_roundps((__v4sf)(__m128)(X), (M))) - -/// Copies three upper elements of the first 128-bit vector operand to -/// the corresponding three upper elements of the 128-bit result vector of -/// [4 x float]. Rounds the lowest element of the second 128-bit vector -/// operand to an integer value according to the rounding control specified -/// by the third argument and copies it to the lowest element of the 128-bit -/// result vector of [4 x float]. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_round_ss(__m128 X, __m128 Y, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSS / ROUNDSS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. The values stored in bits [127:32] are -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [4 x float]. The value stored in bits [31:0] is -/// rounded to the nearest integer using the specified rounding control and -/// copied to the corresponding bits of the result. -/// \param M -/// An integer value that specifies the rounding operation. \n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used \n -/// 1: The PE field is not updated \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M \n -/// 1: Use the current MXCSR setting \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest \n -/// 01: Downward (toward negative infinity) \n -/// 10: Upward (toward positive infinity) \n -/// 11: Truncated -/// \returns A 128-bit vector of [4 x float] containing the copied and rounded -/// values. 
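/* Editor's note: an illustrative usage sketch, not part of the removed
 * header. The rounding-control immediate can also combine a direction with
 * _MM_FROUND_NO_EXC to suppress precision exceptions; example_trunc_quiet()
 * is a hypothetical helper and assumes -msse4.1. */
#include <smmintrin.h>

static inline __m128 example_trunc_quiet(__m128 v)
{
    /* Truncate toward zero without updating the PE field. */
    return _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
}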
-#define _mm_round_ss(X, Y, M) \ - ((__m128)__builtin_ia32_roundss((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M))) - -/// Rounds each element of the 128-bit vector of [2 x double] to an -/// integer value according to the rounding control specified by the second -/// argument and returns the rounded values in a 128-bit vector of -/// [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_round_pd(__m128d X, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDPD / ROUNDPD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. -/// \param M -/// An integer value that specifies the rounding operation. \n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used \n -/// 1: The PE field is not updated \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M \n -/// 1: Use the current MXCSR setting \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest \n -/// 01: Downward (toward negative infinity) \n -/// 10: Upward (toward positive infinity) \n -/// 11: Truncated -/// \returns A 128-bit vector of [2 x double] containing the rounded values. -#define _mm_round_pd(X, M) \ - ((__m128d)__builtin_ia32_roundpd((__v2df)(__m128d)(X), (M))) - -/// Copies the upper element of the first 128-bit vector operand to the -/// corresponding upper element of the 128-bit result vector of [2 x double]. -/// Rounds the lower element of the second 128-bit vector operand to an -/// integer value according to the rounding control specified by the third -/// argument and copies it to the lower element of the 128-bit result vector -/// of [2 x double]. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_round_sd(__m128d X, __m128d Y, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VROUNDSD / ROUNDSD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. The value stored in bits [127:64] is -/// copied to the corresponding bits of the result. -/// \param Y -/// A 128-bit vector of [2 x double]. The value stored in bits [63:0] is -/// rounded to the nearest integer using the specified rounding control and -/// copied to the corresponding bits of the result. -/// \param M -/// An integer value that specifies the rounding operation. \n -/// Bits [7:4] are reserved. \n -/// Bit [3] is a precision exception value: \n -/// 0: A normal PE exception is used \n -/// 1: The PE field is not updated \n -/// Bit [2] is the rounding control source: \n -/// 0: Use bits [1:0] of \a M \n -/// 1: Use the current MXCSR setting \n -/// Bits [1:0] contain the rounding control definition: \n -/// 00: Nearest \n -/// 01: Downward (toward negative infinity) \n -/// 10: Upward (toward positive infinity) \n -/// 11: Truncated -/// \returns A 128-bit vector of [2 x double] containing the copied and rounded -/// values. -#define _mm_round_sd(X, Y, M) \ - ((__m128d)__builtin_ia32_roundsd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M))) - -/* SSE4 Packed Blending Intrinsics. */ -/// Returns a 128-bit vector of [2 x double] where the values are -/// selected from either the first or second operand as specified by the -/// third operand, the control mask. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_blend_pd(__m128d V1, __m128d V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VBLENDPD / BLENDPD instruction. -/// -/// \param V1 -/// A 128-bit vector of [2 x double]. 
-/// \param V2 -/// A 128-bit vector of [2 x double]. -/// \param M -/// An immediate integer operand, with mask bits [1:0] specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// index of a copied value. When a mask bit is 0, the corresponding 64-bit -/// element in operand \a V1 is copied to the same position in the result. -/// When a mask bit is 1, the corresponding 64-bit element in operand \a V2 -/// is copied to the same position in the result. -/// \returns A 128-bit vector of [2 x double] containing the copied values. -#define _mm_blend_pd(V1, V2, M) \ - ((__m128d) __builtin_ia32_blendpd ((__v2df)(__m128d)(V1), \ - (__v2df)(__m128d)(V2), (int)(M))) - -/// Returns a 128-bit vector of [4 x float] where the values are selected -/// from either the first or second operand as specified by the third -/// operand, the control mask. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_blend_ps(__m128 V1, __m128 V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VBLENDPS / BLENDPS instruction. -/// -/// \param V1 -/// A 128-bit vector of [4 x float]. -/// \param V2 -/// A 128-bit vector of [4 x float]. -/// \param M -/// An immediate integer operand, with mask bits [3:0] specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// index of a copied value. When a mask bit is 0, the corresponding 32-bit -/// element in operand \a V1 is copied to the same position in the result. -/// When a mask bit is 1, the corresponding 32-bit element in operand \a V2 -/// is copied to the same position in the result. -/// \returns A 128-bit vector of [4 x float] containing the copied values. -#define _mm_blend_ps(V1, V2, M) \ - ((__m128) __builtin_ia32_blendps ((__v4sf)(__m128)(V1), \ - (__v4sf)(__m128)(V2), (int)(M))) - -/// Returns a 128-bit vector of [2 x double] where the values are -/// selected from either the first or second operand as specified by the -/// third operand, the control mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDVPD / BLENDVPD instruction. -/// -/// \param __V1 -/// A 128-bit vector of [2 x double]. -/// \param __V2 -/// A 128-bit vector of [2 x double]. -/// \param __M -/// A 128-bit vector operand, with mask bits 127 and 63 specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// most significant bit of a copied value. When a mask bit is 0, the -/// corresponding 64-bit element in operand \a __V1 is copied to the same -/// position in the result. When a mask bit is 1, the corresponding 64-bit -/// element in operand \a __V2 is copied to the same position in the result. -/// \returns A 128-bit vector of [2 x double] containing the copied values. -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_blendv_pd (__m128d __V1, __m128d __V2, __m128d __M) -{ - return (__m128d) __builtin_ia32_blendvpd ((__v2df)__V1, (__v2df)__V2, - (__v2df)__M); -} - -/// Returns a 128-bit vector of [4 x float] where the values are -/// selected from either the first or second operand as specified by the -/// third operand, the control mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDVPS / BLENDVPS instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x float]. -/// \param __V2 -/// A 128-bit vector of [4 x float]. -/// \param __M -/// A 128-bit vector operand, with mask bits 127, 95, 63, and 31 specifying -/// how the values are to be copied. 
The position of the mask bit corresponds -/// to the most significant bit of a copied value. When a mask bit is 0, the -/// corresponding 32-bit element in operand \a __V1 is copied to the same -/// position in the result. When a mask bit is 1, the corresponding 32-bit -/// element in operand \a __V2 is copied to the same position in the result. -/// \returns A 128-bit vector of [4 x float] containing the copied values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_blendv_ps (__m128 __V1, __m128 __V2, __m128 __M) -{ - return (__m128) __builtin_ia32_blendvps ((__v4sf)__V1, (__v4sf)__V2, - (__v4sf)__M); -} - -/// Returns a 128-bit vector of [16 x i8] where the values are selected -/// from either of the first or second operand as specified by the third -/// operand, the control mask. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPBLENDVB / PBLENDVB instruction. -/// -/// \param __V1 -/// A 128-bit vector of [16 x i8]. -/// \param __V2 -/// A 128-bit vector of [16 x i8]. -/// \param __M -/// A 128-bit vector operand, with mask bits 127, 119, 111...7 specifying -/// how the values are to be copied. The position of the mask bit corresponds -/// to the most significant bit of a copied value. When a mask bit is 0, the -/// corresponding 8-bit element in operand \a __V1 is copied to the same -/// position in the result. When a mask bit is 1, the corresponding 8-bit -/// element in operand \a __V2 is copied to the same position in the result. -/// \returns A 128-bit vector of [16 x i8] containing the copied values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_blendv_epi8 (__m128i __V1, __m128i __V2, __m128i __M) -{ - return (__m128i) __builtin_ia32_pblendvb128 ((__v16qi)__V1, (__v16qi)__V2, - (__v16qi)__M); -} - -/// Returns a 128-bit vector of [8 x i16] where the values are selected -/// from either of the first or second operand as specified by the third -/// operand, the control mask. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_blend_epi16(__m128i V1, __m128i V2, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPBLENDW / PBLENDW instruction. -/// -/// \param V1 -/// A 128-bit vector of [8 x i16]. -/// \param V2 -/// A 128-bit vector of [8 x i16]. -/// \param M -/// An immediate integer operand, with mask bits [7:0] specifying how the -/// values are to be copied. The position of the mask bit corresponds to the -/// index of a copied value. When a mask bit is 0, the corresponding 16-bit -/// element in operand \a V1 is copied to the same position in the result. -/// When a mask bit is 1, the corresponding 16-bit element in operand \a V2 -/// is copied to the same position in the result. -/// \returns A 128-bit vector of [8 x i16] containing the copied values. -#define _mm_blend_epi16(V1, V2, M) \ - ((__m128i) __builtin_ia32_pblendw128 ((__v8hi)(__m128i)(V1), \ - (__v8hi)(__m128i)(V2), (int)(M))) - -/* SSE4 Dword Multiply Instructions. */ -/// Multiples corresponding elements of two 128-bit vectors of [4 x i32] -/// and returns the lower 32 bits of the each product in a 128-bit vector of -/// [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULLD / PMULLD instruction. -/// -/// \param __V1 -/// A 128-bit integer vector. -/// \param __V2 -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the products of both operands. 
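/* Editor's note: an illustrative usage sketch, not part of the removed
 * header. _mm_blendv_ps() selects lanes by the sign bit of the mask, so a
 * comparison result can drive a lane-wise select; example_select_max() is a
 * hypothetical helper and assumes -msse4.1. */
#include <smmintrin.h>

static inline __m128 example_select_max(__m128 a, __m128 b)
{
    __m128 mask = _mm_cmplt_ps(a, b);   /* all-ones (sign bit set) where a < b */
    return _mm_blendv_ps(a, b, mask);   /* take b where a < b, else a */
}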
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mullo_epi32 (__m128i __V1, __m128i __V2) -{ - return (__m128i) ((__v4su)__V1 * (__v4su)__V2); -} - -/// Multiplies corresponding even-indexed elements of two 128-bit -/// vectors of [4 x i32] and returns a 128-bit vector of [2 x i64] -/// containing the products. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMULDQ / PMULDQ instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x i32]. -/// \param __V2 -/// A 128-bit vector of [4 x i32]. -/// \returns A 128-bit vector of [2 x i64] containing the products of both -/// operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mul_epi32 (__m128i __V1, __m128i __V2) -{ - return (__m128i) __builtin_ia32_pmuldq128 ((__v4si)__V1, (__v4si)__V2); -} - -/* SSE4 Floating Point Dot Product Instructions. */ -/// Computes the dot product of the two 128-bit vectors of [4 x float] -/// and returns it in the elements of the 128-bit result vector of -/// [4 x float]. -/// -/// The immediate integer operand controls which input elements -/// will contribute to the dot product, and where the final results are -/// returned. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_dp_ps(__m128 X, __m128 Y, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VDPPS / DPPS instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. -/// \param Y -/// A 128-bit vector of [4 x float]. -/// \param M -/// An immediate integer operand. Mask bits [7:4] determine which elements -/// of the input vectors are used, with bit [4] corresponding to the lowest -/// element and bit [7] corresponding to the highest element of each [4 x -/// float] vector. If a bit is set, the corresponding elements from the two -/// input vectors are used as an input for dot product; otherwise that input -/// is treated as zero. Bits [3:0] determine which elements of the result -/// will receive a copy of the final dot product, with bit [0] corresponding -/// to the lowest element and bit [3] corresponding to the highest element of -/// each [4 x float] subvector. If a bit is set, the dot product is returned -/// in the corresponding element; otherwise that element is set to zero. -/// \returns A 128-bit vector of [4 x float] containing the dot product. -#define _mm_dp_ps(X, Y, M) \ - ((__m128) __builtin_ia32_dpps((__v4sf)(__m128)(X), \ - (__v4sf)(__m128)(Y), (M))) - -/// Computes the dot product of the two 128-bit vectors of [2 x double] -/// and returns it in the elements of the 128-bit result vector of -/// [2 x double]. -/// -/// The immediate integer operand controls which input -/// elements will contribute to the dot product, and where the final results -/// are returned. -/// -/// \headerfile -/// -/// \code -/// __m128d _mm_dp_pd(__m128d X, __m128d Y, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VDPPD / DPPD instruction. -/// -/// \param X -/// A 128-bit vector of [2 x double]. -/// \param Y -/// A 128-bit vector of [2 x double]. -/// \param M -/// An immediate integer operand. Mask bits [5:4] determine which elements -/// of the input vectors are used, with bit [4] corresponding to the lowest -/// element and bit [5] corresponding to the highest element of each of [2 x -/// double] vector. If a bit is set, the corresponding elements from the two -/// input vectors are used as an input for dot product; otherwise that input -/// is treated as zero. 
Bits [1:0] determine which elements of the result -/// will receive a copy of the final dot product, with bit [0] corresponding -/// to the lowest element and bit [1] corresponding to the highest element of -/// each [2 x double] vector. If a bit is set, the dot product is returned in -/// the corresponding element; otherwise that element is set to zero. -#define _mm_dp_pd(X, Y, M) \ - ((__m128d) __builtin_ia32_dppd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), (M))) - -/* SSE4 Streaming Load Hint Instruction. */ -/// Loads integer values from a 128-bit aligned memory location to a -/// 128-bit integer vector. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTDQA / MOVNTDQA instruction. -/// -/// \param __V -/// A pointer to a 128-bit aligned memory location that contains the integer -/// values. -/// \returns A 128-bit integer vector containing the data stored at the -/// specified memory location. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_stream_load_si128 (__m128i const *__V) -{ - return (__m128i) __builtin_nontemporal_load ((const __v2di *) __V); -} - -/* SSE4 Packed Integer Min/Max Instructions. */ -/// Compares the corresponding elements of two 128-bit vectors of -/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the lesser -/// of the two values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINSB / PMINSB instruction. -/// -/// \param __V1 -/// A 128-bit vector of [16 x i8]. -/// \param __V2 -/// A 128-bit vector of [16 x i8] -/// \returns A 128-bit vector of [16 x i8] containing the lesser values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epi8 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pminsb128 ((__v16qi) __V1, (__v16qi) __V2); -#else - return (__m128i) __builtin_elementwise_min((__v16qs) __V1, (__v16qs) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [16 x i8] and returns a 128-bit vector of [16 x i8] containing the -/// greater value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXSB / PMAXSB instruction. -/// -/// \param __V1 -/// A 128-bit vector of [16 x i8]. -/// \param __V2 -/// A 128-bit vector of [16 x i8]. -/// \returns A 128-bit vector of [16 x i8] containing the greater values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epi8 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pmaxsb128 ((__v16qi) __V1, (__v16qi) __V2); -#else - return (__m128i) __builtin_elementwise_max((__v16qs) __V1, (__v16qs) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the lesser -/// value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINUW / PMINUW instruction. -/// -/// \param __V1 -/// A 128-bit vector of [8 x u16]. -/// \param __V2 -/// A 128-bit vector of [8 x u16]. -/// \returns A 128-bit vector of [8 x u16] containing the lesser values. 
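/* Editor's note: an illustrative usage sketch, not part of the removed
 * header. The SSE4.1 packed min/max intrinsics make lane-wise clamping a
 * two-step operation; example_clamp_epi32() is a hypothetical helper and
 * assumes -msse4.1. */
#include <smmintrin.h>

static inline __m128i example_clamp_epi32(__m128i v, __m128i lo, __m128i hi)
{
    /* Clamp each signed 32-bit lane of v into [lo, hi]. */
    return _mm_min_epi32(_mm_max_epi32(v, lo), hi);
}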
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epu16 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pminuw128 ((__v8hi) __V1, (__v8hi) __V2); -#else - return (__m128i) __builtin_elementwise_min((__v8hu) __V1, (__v8hu) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [8 x u16] and returns a 128-bit vector of [8 x u16] containing the -/// greater value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXUW / PMAXUW instruction. -/// -/// \param __V1 -/// A 128-bit vector of [8 x u16]. -/// \param __V2 -/// A 128-bit vector of [8 x u16]. -/// \returns A 128-bit vector of [8 x u16] containing the greater values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epu16 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pmaxuw128 ((__v8hi) __V1, (__v8hi) __V2); -#else - return (__m128i) __builtin_elementwise_max((__v8hu) __V1, (__v8hu) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the lesser -/// value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINSD / PMINSD instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x i32]. -/// \param __V2 -/// A 128-bit vector of [4 x i32]. -/// \returns A 128-bit vector of [4 x i32] containing the lesser values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epi32 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pminsd128 ((__v4si) __V1, (__v4si) __V2); -#else - return (__m128i) __builtin_elementwise_min((__v4si) __V1, (__v4si) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [4 x i32] and returns a 128-bit vector of [4 x i32] containing the -/// greater value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXSD / PMAXSD instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x i32]. -/// \param __V2 -/// A 128-bit vector of [4 x i32]. -/// \returns A 128-bit vector of [4 x i32] containing the greater values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epi32 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pmaxsd128 ((__v4si) __V1, (__v4si) __V2); -#else - return (__m128i) __builtin_elementwise_max((__v4si) __V1, (__v4si) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the lesser -/// value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMINUD / PMINUD instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x u32]. -/// \param __V2 -/// A 128-bit vector of [4 x u32]. -/// \returns A 128-bit vector of [4 x u32] containing the lesser values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_min_epu32 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pminud128((__v4si) __V1, (__v4si) __V2); -#else - return (__m128i) __builtin_elementwise_min((__v4su) __V1, (__v4su) __V2); -#endif -} - -/// Compares the corresponding elements of two 128-bit vectors of -/// [4 x u32] and returns a 128-bit vector of [4 x u32] containing the -/// greater value of the two. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMAXUD / PMAXUD instruction. 
-/// -/// \param __V1 -/// A 128-bit vector of [4 x u32]. -/// \param __V2 -/// A 128-bit vector of [4 x u32]. -/// \returns A 128-bit vector of [4 x u32] containing the greater values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_max_epu32 (__m128i __V1, __m128i __V2) -{ -#if (__clang_major__ < 14) - return (__m128i) __builtin_ia32_pmaxud128((__v4si) __V1, (__v4si) __V2); -#else - return (__m128i) __builtin_elementwise_max((__v4su) __V1, (__v4su) __V2); -#endif -} - -/* SSE4 Insertion and Extraction from XMM Register Instructions. */ -/// Takes the first argument \a X and inserts an element from the second -/// argument \a Y as selected by the third argument \a N. That result then -/// has elements zeroed out also as selected by the third argument \a N. The -/// resulting 128-bit vector of [4 x float] is then returned. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_insert_ps(__m128 X, __m128 Y, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VINSERTPS instruction. -/// -/// \param X -/// A 128-bit vector source operand of [4 x float]. With the exception of -/// those bits in the result copied from parameter \a Y and zeroed by bits -/// [3:0] of \a N, all bits from this parameter are copied to the result. -/// \param Y -/// A 128-bit vector source operand of [4 x float]. One single-precision -/// floating-point element from this source, as determined by the immediate -/// parameter, is copied to the result. -/// \param N -/// Specifies which bits from operand \a Y will be copied, which bits in the -/// result they will be be copied to, and which bits in the result will be -/// cleared. The following assignments are made: \n -/// Bits [7:6] specify the bits to copy from operand \a Y: \n -/// 00: Selects bits [31:0] from operand \a Y. \n -/// 01: Selects bits [63:32] from operand \a Y. \n -/// 10: Selects bits [95:64] from operand \a Y. \n -/// 11: Selects bits [127:96] from operand \a Y. \n -/// Bits [5:4] specify the bits in the result to which the selected bits -/// from operand \a Y are copied: \n -/// 00: Copies the selected bits from \a Y to result bits [31:0]. \n -/// 01: Copies the selected bits from \a Y to result bits [63:32]. \n -/// 10: Copies the selected bits from \a Y to result bits [95:64]. \n -/// 11: Copies the selected bits from \a Y to result bits [127:96]. \n -/// Bits[3:0]: If any of these bits are set, the corresponding result -/// element is cleared. -/// \returns A 128-bit vector of [4 x float] containing the copied -/// single-precision floating point elements from the operands. -#define _mm_insert_ps(X, Y, N) __builtin_ia32_insertps128((X), (Y), (N)) - -/// Extracts a 32-bit integer from a 128-bit vector of [4 x float] and -/// returns it, using the immediate value parameter \a N as a selector. -/// -/// \headerfile -/// -/// \code -/// int _mm_extract_ps(__m128 X, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VEXTRACTPS / EXTRACTPS -/// instruction. -/// -/// \param X -/// A 128-bit vector of [4 x float]. -/// \param N -/// An immediate value. Bits [1:0] determines which bits from the argument -/// \a X are extracted and returned: \n -/// 00: Bits [31:0] of parameter \a X are returned. \n -/// 01: Bits [63:32] of parameter \a X are returned. \n -/// 10: Bits [95:64] of parameter \a X are returned. \n -/// 11: Bits [127:96] of parameter \a X are returned. -/// \returns A 32-bit integer containing the extracted 32 bits of float data. 
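/* Editor's note: an illustrative usage sketch, not part of the removed
 * header. In the _mm_dp_ps() immediate, the upper nibble selects which input
 * lanes contribute and the lower nibble selects which result lanes receive
 * the dot product; 0xF1 uses all four inputs and writes only lane 0.
 * example_dot4() is a hypothetical helper and assumes -msse4.1. */
#include <smmintrin.h>

static inline float example_dot4(__m128 a, __m128 b)
{
    return _mm_cvtss_f32(_mm_dp_ps(a, b, 0xF1));
}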
-#define _mm_extract_ps(X, N) \ - __builtin_bit_cast(int, __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N))) - -/* Miscellaneous insert and extract macros. */ -/* Extract a single-precision float from X at index N into D. */ -#define _MM_EXTRACT_FLOAT(D, X, N) \ - do { (D) = __builtin_ia32_vec_ext_v4sf((__v4sf)(__m128)(X), (int)(N)); } while (0) - -/* Or together 2 sets of indexes (X and Y) with the zeroing bits (Z) to create - an index suitable for _mm_insert_ps. */ -#define _MM_MK_INSERTPS_NDX(X, Y, Z) (((X) << 6) | ((Y) << 4) | (Z)) - -/* Extract a float from X at index N into the first index of the return. */ -#define _MM_PICK_OUT_PS(X, N) _mm_insert_ps (_mm_setzero_ps(), (X), \ - _MM_MK_INSERTPS_NDX((N), 0, 0x0e)) - -/* Insert int into packed integer array at index. */ -/// Constructs a 128-bit vector of [16 x i8] by first making a copy of -/// the 128-bit integer vector parameter, and then inserting the lower 8 bits -/// of an integer parameter \a I into an offset specified by the immediate -/// value parameter \a N. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_insert_epi8(__m128i X, int I, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPINSRB / PINSRB instruction. -/// -/// \param X -/// A 128-bit integer vector of [16 x i8]. This vector is copied to the -/// result and then one of the sixteen elements in the result vector is -/// replaced by the lower 8 bits of \a I. -/// \param I -/// An integer. The lower 8 bits of this operand are written to the result -/// beginning at the offset specified by \a N. -/// \param N -/// An immediate value. Bits [3:0] specify the bit offset in the result at -/// which the lower 8 bits of \a I are written. \n -/// 0000: Bits [7:0] of the result are used for insertion. \n -/// 0001: Bits [15:8] of the result are used for insertion. \n -/// 0010: Bits [23:16] of the result are used for insertion. \n -/// 0011: Bits [31:24] of the result are used for insertion. \n -/// 0100: Bits [39:32] of the result are used for insertion. \n -/// 0101: Bits [47:40] of the result are used for insertion. \n -/// 0110: Bits [55:48] of the result are used for insertion. \n -/// 0111: Bits [63:56] of the result are used for insertion. \n -/// 1000: Bits [71:64] of the result are used for insertion. \n -/// 1001: Bits [79:72] of the result are used for insertion. \n -/// 1010: Bits [87:80] of the result are used for insertion. \n -/// 1011: Bits [95:88] of the result are used for insertion. \n -/// 1100: Bits [103:96] of the result are used for insertion. \n -/// 1101: Bits [111:104] of the result are used for insertion. \n -/// 1110: Bits [119:112] of the result are used for insertion. \n -/// 1111: Bits [127:120] of the result are used for insertion. -/// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi8(X, I, N) \ - ((__m128i)__builtin_ia32_vec_set_v16qi((__v16qi)(__m128i)(X), \ - (int)(I), (int)(N))) - -/// Constructs a 128-bit vector of [4 x i32] by first making a copy of -/// the 128-bit integer vector parameter, and then inserting the 32-bit -/// integer parameter \a I at the offset specified by the immediate value -/// parameter \a N. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_insert_epi32(__m128i X, int I, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPINSRD / PINSRD instruction. -/// -/// \param X -/// A 128-bit integer vector of [4 x i32]. 
This vector is copied to the -/// result and then one of the four elements in the result vector is -/// replaced by \a I. -/// \param I -/// A 32-bit integer that is written to the result beginning at the offset -/// specified by \a N. -/// \param N -/// An immediate value. Bits [1:0] specify the bit offset in the result at -/// which the integer \a I is written. \n -/// 00: Bits [31:0] of the result are used for insertion. \n -/// 01: Bits [63:32] of the result are used for insertion. \n -/// 10: Bits [95:64] of the result are used for insertion. \n -/// 11: Bits [127:96] of the result are used for insertion. -/// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi32(X, I, N) \ - ((__m128i)__builtin_ia32_vec_set_v4si((__v4si)(__m128i)(X), \ - (int)(I), (int)(N))) - -#ifdef __x86_64__ -/// Constructs a 128-bit vector of [2 x i64] by first making a copy of -/// the 128-bit integer vector parameter, and then inserting the 64-bit -/// integer parameter \a I, using the immediate value parameter \a N as an -/// insertion location selector. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_insert_epi64(__m128i X, long long I, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPINSRQ / PINSRQ instruction. -/// -/// \param X -/// A 128-bit integer vector of [2 x i64]. This vector is copied to the -/// result and then one of the two elements in the result vector is replaced -/// by \a I. -/// \param I -/// A 64-bit integer that is written to the result beginning at the offset -/// specified by \a N. -/// \param N -/// An immediate value. Bit [0] specifies the bit offset in the result at -/// which the integer \a I is written. \n -/// 0: Bits [63:0] of the result are used for insertion. \n -/// 1: Bits [127:64] of the result are used for insertion. \n -/// \returns A 128-bit integer vector containing the constructed values. -#define _mm_insert_epi64(X, I, N) \ - ((__m128i)__builtin_ia32_vec_set_v2di((__v2di)(__m128i)(X), \ - (long long)(I), (int)(N))) -#endif /* __x86_64__ */ - -/* Extract int from packed integer array at index. This returns the element - * as a zero extended value, so it is unsigned. - */ -/// Extracts an 8-bit element from the 128-bit integer vector of -/// [16 x i8], using the immediate value parameter \a N as a selector. -/// -/// \headerfile -/// -/// \code -/// int _mm_extract_epi8(__m128i X, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPEXTRB / PEXTRB instruction. -/// -/// \param X -/// A 128-bit integer vector. -/// \param N -/// An immediate value. Bits [3:0] specify which 8-bit vector element from -/// the argument \a X to extract and copy to the result. \n -/// 0000: Bits [7:0] of parameter \a X are extracted. \n -/// 0001: Bits [15:8] of the parameter \a X are extracted. \n -/// 0010: Bits [23:16] of the parameter \a X are extracted. \n -/// 0011: Bits [31:24] of the parameter \a X are extracted. \n -/// 0100: Bits [39:32] of the parameter \a X are extracted. \n -/// 0101: Bits [47:40] of the parameter \a X are extracted. \n -/// 0110: Bits [55:48] of the parameter \a X are extracted. \n -/// 0111: Bits [63:56] of the parameter \a X are extracted. \n -/// 1000: Bits [71:64] of the parameter \a X are extracted. \n -/// 1001: Bits [79:72] of the parameter \a X are extracted. \n -/// 1010: Bits [87:80] of the parameter \a X are extracted. \n -/// 1011: Bits [95:88] of the parameter \a X are extracted. \n -/// 1100: Bits [103:96] of the parameter \a X are extracted. 
\n -/// 1101: Bits [111:104] of the parameter \a X are extracted. \n -/// 1110: Bits [119:112] of the parameter \a X are extracted. \n -/// 1111: Bits [127:120] of the parameter \a X are extracted. -/// \returns An unsigned integer, whose lower 8 bits are selected from the -/// 128-bit integer vector parameter and the remaining bits are assigned -/// zeros. -#define _mm_extract_epi8(X, N) \ - ((int)(unsigned char)__builtin_ia32_vec_ext_v16qi((__v16qi)(__m128i)(X), \ - (int)(N))) - -/// Extracts a 32-bit element from the 128-bit integer vector of -/// [4 x i32], using the immediate value parameter \a N as a selector. -/// -/// \headerfile -/// -/// \code -/// int _mm_extract_epi32(__m128i X, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPEXTRD / PEXTRD instruction. -/// -/// \param X -/// A 128-bit integer vector. -/// \param N -/// An immediate value. Bits [1:0] specify which 32-bit vector element from -/// the argument \a X to extract and copy to the result. \n -/// 00: Bits [31:0] of the parameter \a X are extracted. \n -/// 01: Bits [63:32] of the parameter \a X are extracted. \n -/// 10: Bits [95:64] of the parameter \a X are extracted. \n -/// 11: Bits [127:96] of the parameter \a X are exracted. -/// \returns An integer, whose lower 32 bits are selected from the 128-bit -/// integer vector parameter and the remaining bits are assigned zeros. -#define _mm_extract_epi32(X, N) \ - ((int)__builtin_ia32_vec_ext_v4si((__v4si)(__m128i)(X), (int)(N))) - -#ifdef __x86_64__ -/// Extracts a 64-bit element from the 128-bit integer vector of -/// [2 x i64], using the immediate value parameter \a N as a selector. -/// -/// \headerfile -/// -/// \code -/// long long _mm_extract_epi64(__m128i X, const int N); -/// \endcode -/// -/// This intrinsic corresponds to the VPEXTRQ / PEXTRQ instruction. -/// -/// \param X -/// A 128-bit integer vector. -/// \param N -/// An immediate value. Bit [0] specifies which 64-bit vector element from -/// the argument \a X to return. \n -/// 0: Bits [63:0] are returned. \n -/// 1: Bits [127:64] are returned. \n -/// \returns A 64-bit integer. -#define _mm_extract_epi64(X, N) \ - ((long long)__builtin_ia32_vec_ext_v2di((__v2di)(__m128i)(X), (int)(N))) -#endif /* __x86_64 */ - -/* SSE4 128-bit Packed Integer Comparisons. */ -/// Tests whether the specified bits in a 128-bit integer vector are all -/// zeros. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param __M -/// A 128-bit integer vector containing the bits to be tested. -/// \param __V -/// A 128-bit integer vector selecting which bits to test in operand \a __M. -/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_testz_si128(__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestz128((__v2di)__M, (__v2di)__V); -} - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// ones. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param __M -/// A 128-bit integer vector containing the bits to be tested. -/// \param __V -/// A 128-bit integer vector selecting which bits to test in operand \a __M. -/// \returns TRUE if the specified bits are all ones; FALSE otherwise. 
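For readers skimming the removed header, a minimal usage sketch of the SSE4.1 element-access and PTEST intrinsics documented above (illustrative only, not part of the patch; the helper name and values are hypothetical and an SSE4.1-capable build, e.g. -msse4.1, is assumed):

#include <smmintrin.h>

static int example_insert_extract_test(void)
{
    __m128i v = _mm_setr_epi32(1, 2, 3, 4);
    v = _mm_insert_epi32(v, 42, 2);        /* lane 2 now holds 42 */
    int x = _mm_extract_epi32(v, 2);       /* reads back 42 */
    /* PTEST: the zero flag is set because (v AND 0) has no bits set,
     * so _mm_testz_si128 returns 1 here. */
    int all_zero = _mm_testz_si128(v, _mm_setzero_si128());
    return (x == 42) && (all_zero == 1);
}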
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_testc_si128(__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestc128((__v2di)__M, (__v2di)__V); -} - -/// Tests whether the specified bits in a 128-bit integer vector are -/// neither all zeros nor all ones. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param __M -/// A 128-bit integer vector containing the bits to be tested. -/// \param __V -/// A 128-bit integer vector selecting which bits to test in operand \a __M. -/// \returns TRUE if the specified bits are neither all zeros nor all ones; -/// FALSE otherwise. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_testnzc_si128(__m128i __M, __m128i __V) -{ - return __builtin_ia32_ptestnzc128((__v2di)__M, (__v2di)__V); -} - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// ones. -/// -/// \headerfile -/// -/// \code -/// int _mm_test_all_ones(__m128i V); -/// \endcode -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param V -/// A 128-bit integer vector containing the bits to be tested. -/// \returns TRUE if the bits specified in the operand are all set to 1; FALSE -/// otherwise. -#define _mm_test_all_ones(V) _mm_testc_si128((V), _mm_cmpeq_epi32((V), (V))) - -/// Tests whether the specified bits in a 128-bit integer vector are -/// neither all zeros nor all ones. -/// -/// \headerfile -/// -/// \code -/// int _mm_test_mix_ones_zeros(__m128i M, __m128i V); -/// \endcode -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param M -/// A 128-bit integer vector containing the bits to be tested. -/// \param V -/// A 128-bit integer vector selecting which bits to test in operand \a M. -/// \returns TRUE if the specified bits are neither all zeros nor all ones; -/// FALSE otherwise. -#define _mm_test_mix_ones_zeros(M, V) _mm_testnzc_si128((M), (V)) - -/// Tests whether the specified bits in a 128-bit integer vector are all -/// zeros. -/// -/// \headerfile -/// -/// \code -/// int _mm_test_all_zeros(__m128i M, __m128i V); -/// \endcode -/// -/// This intrinsic corresponds to the VPTEST / PTEST instruction. -/// -/// \param M -/// A 128-bit integer vector containing the bits to be tested. -/// \param V -/// A 128-bit integer vector selecting which bits to test in operand \a M. -/// \returns TRUE if the specified bits are all zeros; FALSE otherwise. -#define _mm_test_all_zeros(M, V) _mm_testz_si128 ((M), (V)) - -/* SSE4 64-bit Packed Integer Comparisons. */ -/// Compares each of the corresponding 64-bit values of the 128-bit -/// integer vectors for equality. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPEQQ / PCMPEQQ instruction. -/// -/// \param __V1 -/// A 128-bit integer vector. -/// \param __V2 -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpeq_epi64(__m128i __V1, __m128i __V2) -{ - return (__m128i)((__v2di)__V1 == (__v2di)__V2); -} - -/* SSE4 Packed Integer Sign-Extension. */ -/// Sign-extends each of the lower eight 8-bit integer elements of a -/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a -/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector -/// are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXBW / PMOVSXBW instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. 
The lower eight 8-bit elements are sign- -/// extended to 16-bit values. -/// \returns A 128-bit vector of [8 x i16] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi8_epi16(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); -} - -/// Sign-extends each of the lower four 8-bit integer elements of a -/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a -/// 128-bit vector of [4 x i32]. The upper twelve elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXBD / PMOVSXBD instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are -/// sign-extended to 32-bit values. -/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi8_epi32(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1, 2, 3), __v4si); -} - -/// Sign-extends each of the lower two 8-bit integer elements of a -/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXBQ / PMOVSXBQ instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are -/// sign-extended to 64-bit values. -/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi8_epi64(__m128i __V) -{ - /* This function always performs a signed extension, but __v16qi is a char - which may be signed or unsigned, so use __v16qs. */ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qs)__V, (__v16qs)__V, 0, 1), __v2di); -} - -/// Sign-extends each of the lower four 16-bit integer elements of a -/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in -/// a 128-bit vector of [4 x i32]. The upper four elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXWD / PMOVSXWD instruction. -/// -/// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are -/// sign-extended to 32-bit values. -/// \returns A 128-bit vector of [4 x i32] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi16_epi32(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1, 2, 3), __v4si); -} - -/// Sign-extends each of the lower two 16-bit integer elements of a -/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper six elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXWQ / PMOVSXWQ instruction. -/// -/// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are -/// sign-extended to 64-bit values. 
-/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi16_epi64(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hi)__V, (__v8hi)__V, 0, 1), __v2di); -} - -/// Sign-extends each of the lower two 32-bit integer elements of a -/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector -/// are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVSXDQ / PMOVSXDQ instruction. -/// -/// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are -/// sign-extended to 64-bit values. -/// \returns A 128-bit vector of [2 x i64] containing the sign-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepi32_epi64(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4si)__V, (__v4si)__V, 0, 1), __v2di); -} - -/* SSE4 Packed Integer Zero-Extension. */ -/// Zero-extends each of the lower eight 8-bit integer elements of a -/// 128-bit vector of [16 x i8] to 16-bit values and returns them in a -/// 128-bit vector of [8 x i16]. The upper eight elements of the input vector -/// are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXBW / PMOVZXBW instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. The lower eight 8-bit elements are -/// zero-extended to 16-bit values. -/// \returns A 128-bit vector of [8 x i16] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu8_epi16(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3, 4, 5, 6, 7), __v8hi); -} - -/// Zero-extends each of the lower four 8-bit integer elements of a -/// 128-bit vector of [16 x i8] to 32-bit values and returns them in a -/// 128-bit vector of [4 x i32]. The upper twelve elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXBD / PMOVZXBD instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. The lower four 8-bit elements are -/// zero-extended to 32-bit values. -/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu8_epi32(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1, 2, 3), __v4si); -} - -/// Zero-extends each of the lower two 8-bit integer elements of a -/// 128-bit integer vector of [16 x i8] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper fourteen elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXBQ / PMOVZXBQ instruction. -/// -/// \param __V -/// A 128-bit vector of [16 x i8]. The lower two 8-bit elements are -/// zero-extended to 64-bit values. -/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu8_epi64(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v16qu)__V, (__v16qu)__V, 0, 1), __v2di); -} - -/// Zero-extends each of the lower four 16-bit integer elements of a -/// 128-bit integer vector of [8 x i16] to 32-bit values and returns them in -/// a 128-bit vector of [4 x i32]. 
The upper four elements of the input -/// vector are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXWD / PMOVZXWD instruction. -/// -/// \param __V -/// A 128-bit vector of [8 x i16]. The lower four 16-bit elements are -/// zero-extended to 32-bit values. -/// \returns A 128-bit vector of [4 x i32] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu16_epi32(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1, 2, 3), __v4si); -} - -/// Zero-extends each of the lower two 16-bit integer elements of a -/// 128-bit integer vector of [8 x i16] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper six elements of the input vector -/// are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXWQ / PMOVZXWQ instruction. -/// -/// \param __V -/// A 128-bit vector of [8 x i16]. The lower two 16-bit elements are -/// zero-extended to 64-bit values. -/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu16_epi64(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v8hu)__V, (__v8hu)__V, 0, 1), __v2di); -} - -/// Zero-extends each of the lower two 32-bit integer elements of a -/// 128-bit integer vector of [4 x i32] to 64-bit values and returns them in -/// a 128-bit vector of [2 x i64]. The upper two elements of the input vector -/// are unused. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPMOVZXDQ / PMOVZXDQ instruction. -/// -/// \param __V -/// A 128-bit vector of [4 x i32]. The lower two 32-bit elements are -/// zero-extended to 64-bit values. -/// \returns A 128-bit vector of [2 x i64] containing the zero-extended values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cvtepu32_epi64(__m128i __V) -{ - return (__m128i)__builtin_convertvector(__builtin_shufflevector((__v4su)__V, (__v4su)__V, 0, 1), __v2di); -} - -/* SSE4 Pack with Unsigned Saturation. */ -/// Converts 32-bit signed integers from both 128-bit integer vector -/// operands into 16-bit unsigned integers, and returns the packed result. -/// Values greater than 0xFFFF are saturated to 0xFFFF. Values less than -/// 0x0000 are saturated to 0x0000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPACKUSDW / PACKUSDW instruction. -/// -/// \param __V1 -/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a -/// signed integer and is converted to a 16-bit unsigned integer with -/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values -/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values -/// are written to the lower 64 bits of the result. -/// \param __V2 -/// A 128-bit vector of [4 x i32]. Each 32-bit element is treated as a -/// signed integer and is converted to a 16-bit unsigned integer with -/// saturation. Values greater than 0xFFFF are saturated to 0xFFFF. Values -/// less than 0x0000 are saturated to 0x0000. The converted [4 x i16] values -/// are written to the higher 64 bits of the result. -/// \returns A 128-bit vector of [8 x i16] containing the converted values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_packus_epi32(__m128i __V1, __m128i __V2) -{ - return (__m128i) __builtin_ia32_packusdw128((__v4si)__V1, (__v4si)__V2); -} - -/* SSE4 Multiple Packed Sums of Absolute Difference. 
*/ -/// Subtracts 8-bit unsigned integer values and computes the absolute -/// values of the differences to the corresponding bits in the destination. -/// Then sums of the absolute differences are returned according to the bit -/// fields in the immediate operand. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_mpsadbw_epu8(__m128i X, __m128i Y, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VMPSADBW / MPSADBW instruction. -/// -/// \param X -/// A 128-bit vector of [16 x i8]. -/// \param Y -/// A 128-bit vector of [16 x i8]. -/// \param M -/// An 8-bit immediate operand specifying how the absolute differences are to -/// be calculated, according to the following algorithm: -/// \code -/// // M2 represents bit 2 of the immediate operand -/// // M10 represents bits [1:0] of the immediate operand -/// i = M2 * 4; -/// j = M10 * 4; -/// for (k = 0; k < 8; k = k + 1) { -/// d0 = abs(X[i + k + 0] - Y[j + 0]); -/// d1 = abs(X[i + k + 1] - Y[j + 1]); -/// d2 = abs(X[i + k + 2] - Y[j + 2]); -/// d3 = abs(X[i + k + 3] - Y[j + 3]); -/// r[k] = d0 + d1 + d2 + d3; -/// } -/// \endcode -/// \returns A 128-bit integer vector containing the sums of the sets of -/// absolute differences between both operands. -#define _mm_mpsadbw_epu8(X, Y, M) \ - ((__m128i) __builtin_ia32_mpsadbw128((__v16qi)(__m128i)(X), \ - (__v16qi)(__m128i)(Y), (M))) - -/// Finds the minimum unsigned 16-bit element in the input 128-bit -/// vector of [8 x u16] and returns it and along with its index. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPHMINPOSUW / PHMINPOSUW -/// instruction. -/// -/// \param __V -/// A 128-bit vector of [8 x u16]. -/// \returns A 128-bit value where bits [15:0] contain the minimum value found -/// in parameter \a __V, bits [18:16] contain the index of the minimum value -/// and the remaining bits are set to 0. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_minpos_epu16(__m128i __V) -{ - return (__m128i) __builtin_ia32_phminposuw128((__v8hi)__V); -} - -/* Handle the sse4.2 definitions here. */ - -/* These definitions are normally in nmmintrin.h, but gcc puts them in here - so we'll do the same. */ - -#undef __DEFAULT_FN_ATTRS -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse4.2"))) - -/* These specify the type of data that we're comparing. */ -#define _SIDD_UBYTE_OPS 0x00 -#define _SIDD_UWORD_OPS 0x01 -#define _SIDD_SBYTE_OPS 0x02 -#define _SIDD_SWORD_OPS 0x03 - -/* These specify the type of comparison operation. */ -#define _SIDD_CMP_EQUAL_ANY 0x00 -#define _SIDD_CMP_RANGES 0x04 -#define _SIDD_CMP_EQUAL_EACH 0x08 -#define _SIDD_CMP_EQUAL_ORDERED 0x0c - -/* These macros specify the polarity of the operation. */ -#define _SIDD_POSITIVE_POLARITY 0x00 -#define _SIDD_NEGATIVE_POLARITY 0x10 -#define _SIDD_MASKED_POSITIVE_POLARITY 0x20 -#define _SIDD_MASKED_NEGATIVE_POLARITY 0x30 - -/* These macros are used in _mm_cmpXstri() to specify the return. */ -#define _SIDD_LEAST_SIGNIFICANT 0x00 -#define _SIDD_MOST_SIGNIFICANT 0x40 - -/* These macros are used in _mm_cmpXstri() to specify the return. */ -#define _SIDD_BIT_MASK 0x00 -#define _SIDD_UNIT_MASK 0x40 - -/* SSE4.2 Packed Comparison Intrinsics. */ -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns a 128-bit integer vector representing the result -/// mask of the comparison. 
-/// -/// \headerfile -/// -/// \code -/// __m128i _mm_cmpistrm(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRM / PCMPISTRM -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words, the type of comparison to perform, and the format of the return -/// value. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 -/// bytes. \n -/// 0: The result is zero-extended to 16 bytes. \n -/// 1: The result is expanded to 16 bytes (this expansion is performed by -/// repeating each bit 8 or 16 times). -/// \returns Returns a 128-bit integer vector representing the result mask of -/// the comparison. -#define _mm_cmpistrm(A, B, M) \ - ((__m128i)__builtin_ia32_pcmpistrm128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns an integer representing the result index of the -/// comparison. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistri(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words, the type of comparison to perform, and the format of the return -/// value. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. 
\n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// Bit [6]: Determines whether the index of the lowest set bit or the -/// highest set bit is returned. \n -/// 0: The index of the least significant set bit. \n -/// 1: The index of the most significant set bit. \n -/// \returns Returns an integer representing the result index of the comparison. -#define _mm_cmpistri(A, B, M) \ - ((int)__builtin_ia32_pcmpistri128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns a 128-bit integer vector representing the result -/// mask of the comparison. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_cmpestrm(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRM / PCMPESTRM -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words, the type of comparison to perform, and the format of the return -/// value. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// Bit [6]: Determines whether the result is zero-extended or expanded to 16 -/// bytes. \n -/// 0: The result is zero-extended to 16 bytes. \n -/// 1: The result is expanded to 16 bytes (this expansion is performed by -/// repeating each bit 8 or 16 times). \n -/// \returns Returns a 128-bit integer vector representing the result mask of -/// the comparison. 
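As a usage illustration of the implicit-length string intrinsics above, a hedged sketch follows (not part of the patch; the helper name and data are hypothetical and an SSE4.2 build, e.g. -msse4.2, is assumed):

#include <nmmintrin.h>

/* Index of the first vowel in a NUL-terminated chunk of at most 16 bytes
 * (16 readable bytes at 's16' are assumed); returns 16 if none is found. */
static int first_vowel_index(const char *s16)
{
    __m128i set   = _mm_setr_epi8('a', 'e', 'i', 'o', 'u',
                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i chunk = _mm_loadu_si128((const __m128i *)s16);
    /* Equal-any aggregation: index of the first byte of 'chunk' matching
     * any byte of 'set', scanning from the least significant element. */
    return _mm_cmpistri(set, chunk,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                        _SIDD_LEAST_SIGNIFICANT);
}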
-#define _mm_cmpestrm(A, LA, B, LB, M) \ - ((__m128i)__builtin_ia32_pcmpestrm128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns an integer representing the result index of the -/// comparison. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestri(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words, the type of comparison to perform, and the format of the return -/// value. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// Bit [6]: Determines whether the index of the lowest set bit or the -/// highest set bit is returned. \n -/// 0: The index of the least significant set bit. \n -/// 1: The index of the most significant set bit. \n -/// \returns Returns an integer representing the result index of the comparison. -#define _mm_cmpestri(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestri128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/* SSE4.2 Packed Comparison Intrinsics and EFlag Reading. */ -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the -/// string in \a B is the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistra(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. 
-/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// \returns Returns 1 if the bit mask is zero and the length of the string in -/// \a B is the maximum; otherwise, returns 0. -#define _mm_cmpistra(A, B, M) \ - ((int)__builtin_ia32_pcmpistria128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the bit mask is non-zero, otherwise, returns -/// 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistrc(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. -/// \returns Returns 1 if the bit mask is non-zero, otherwise, returns 0. 
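The flag-reading variants are typically used as cheap yes/no predicates rather than for their index. A small sketch using the ranges aggregation with _mm_cmpistrc (illustrative only, not part of the patch; the helper name is hypothetical, an SSE4.2 build is assumed):

#include <nmmintrin.h>

/* Non-zero if any of the first 16 bytes at 's16' (up to an embedded NUL)
 * is an ASCII digit; 16 readable bytes at 's16' are assumed. */
static int contains_digit(const char *s16)
{
    /* Ranges aggregation: even/odd byte pairs in the first operand give
     * inclusive lower/upper bounds, here '0'..'9'. */
    __m128i digits = _mm_setr_epi8('0', '9',
                                   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);
    __m128i chunk  = _mm_loadu_si128((const __m128i *)s16);
    return _mm_cmpistrc(digits, chunk, _SIDD_UBYTE_OPS | _SIDD_CMP_RANGES);
}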
-#define _mm_cmpistrc(A, B, M) \ - ((int)__builtin_ia32_pcmpistric128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns bit 0 of the resulting bit mask. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistro(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// \returns Returns bit 0 of the resulting bit mask. -#define _mm_cmpistro(A, B, M) \ - ((int)__builtin_ia32_pcmpistrio128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the length of the string in \a A is less than -/// the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistrs(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. 
\n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// \returns Returns 1 if the length of the string in \a A is less than the -/// maximum, otherwise, returns 0. -#define _mm_cmpistrs(A, B, M) \ - ((int)__builtin_ia32_pcmpistris128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with implicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the length of the string in \a B is less than -/// the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpistrz(__m128i A, __m128i B, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPISTRI / PCMPISTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. -/// \returns Returns 1 if the length of the string in \a B is less than the -/// maximum, otherwise, returns 0. -#define _mm_cmpistrz(A, B, M) \ - ((int)__builtin_ia32_pcmpistriz128((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the bit mask is zero and the length of the -/// string in \a B is the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestra(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. 
-/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. -/// \returns Returns 1 if the bit mask is zero and the length of the string in -/// \a B is the maximum, otherwise, returns 0. -#define _mm_cmpestra(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestria128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the resulting mask is non-zero, otherwise, -/// returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestrc(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. 
\n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// \returns Returns 1 if the resulting mask is non-zero, otherwise, returns 0. -#define _mm_cmpestrc(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestric128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns bit 0 of the resulting bit mask. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestro(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. -/// \returns Returns bit 0 of the resulting bit mask. -#define _mm_cmpestro(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestrio128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the length of the string in \a A is less than -/// the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestrs(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI / PCMPESTRI -/// instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. 
\n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement in the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. \n -/// \returns Returns 1 if the length of the string in \a A is less than the -/// maximum, otherwise, returns 0. -#define _mm_cmpestrs(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestris128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/// Uses the immediate operand \a M to perform a comparison of string -/// data with explicitly defined lengths that is contained in source operands -/// \a A and \a B. Returns 1 if the length of the string in \a B is less than -/// the maximum, otherwise, returns 0. -/// -/// \headerfile -/// -/// \code -/// int _mm_cmpestrz(__m128i A, int LA, __m128i B, int LB, const int M); -/// \endcode -/// -/// This intrinsic corresponds to the VPCMPESTRI instruction. -/// -/// \param A -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LA -/// An integer that specifies the length of the string in \a A. -/// \param B -/// A 128-bit integer vector containing one of the source operands to be -/// compared. -/// \param LB -/// An integer that specifies the length of the string in \a B. -/// \param M -/// An 8-bit immediate operand specifying whether the characters are bytes or -/// words and the type of comparison to perform. \n -/// Bits [1:0]: Determine source data format. \n -/// 00: 16 unsigned bytes \n -/// 01: 8 unsigned words \n -/// 10: 16 signed bytes \n -/// 11: 8 signed words \n -/// Bits [3:2]: Determine comparison type and aggregation method. \n -/// 00: Subset: Each character in \a B is compared for equality with all -/// the characters in \a A. \n -/// 01: Ranges: Each character in \a B is compared to \a A. The comparison -/// basis is greater than or equal for even-indexed elements in \a A, -/// and less than or equal for odd-indexed elements in \a A. \n -/// 10: Match: Compare each pair of corresponding characters in \a A and -/// \a B for equality. \n -/// 11: Substring: Search \a B for substring matches of \a A. \n -/// Bits [5:4]: Determine whether to perform a one's complement on the bit -/// mask of the comparison results. \n -/// 00: No effect. \n -/// 01: Negate the bit mask. \n -/// 10: No effect. \n -/// 11: Negate the bit mask only for bits with an index less than or equal -/// to the size of \a A or \a B. -/// \returns Returns 1 if the length of the string in \a B is less than the -/// maximum, otherwise, returns 0. 
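For the explicit-length family above, a brief sketch of the typical calling pattern (not part of the patch; the helper name is hypothetical, an SSE4.2 build is assumed):

#include <nmmintrin.h>

/* Index of the first occurrence of byte 'c' within the first 'len' bytes
 * of 'buf' (len in 0..16); returns 16 if absent. The load still touches
 * 16 bytes, so 16 readable bytes at 'buf' are assumed. */
static int find_byte(const char *buf, int len, char c)
{
    __m128i needle = _mm_setr_epi8(c, 0, 0, 0, 0, 0, 0, 0,
                                   0, 0, 0, 0, 0, 0, 0, 0);
    __m128i chunk  = _mm_loadu_si128((const __m128i *)buf);
    /* Explicit lengths: only one needle byte and only 'len' chunk bytes
     * take part in the comparison. */
    return _mm_cmpestri(needle, 1, chunk, len,
                        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY |
                        _SIDD_LEAST_SIGNIFICANT);
}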
-#define _mm_cmpestrz(A, LA, B, LB, M) \ - ((int)__builtin_ia32_pcmpestriz128((__v16qi)(__m128i)(A), (int)(LA), \ - (__v16qi)(__m128i)(B), (int)(LB), \ - (int)(M))) - -/* SSE4.2 Compare Packed Data -- Greater Than. */ -/// Compares each of the corresponding 64-bit values of the 128-bit -/// integer vectors to determine if the values in the first operand are -/// greater than those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPCMPGTQ / PCMPGTQ instruction. -/// -/// \param __V1 -/// A 128-bit integer vector. -/// \param __V2 -/// A 128-bit integer vector. -/// \returns A 128-bit integer vector containing the comparison results. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmpgt_epi64(__m128i __V1, __m128i __V2) -{ - return (__m128i)((__v2di)__V1 > (__v2di)__V2); -} - -#undef __DEFAULT_FN_ATTRS - -#include - -#include - -#endif /* __SMMINTRIN_H */ diff --git a/include/tbmintrin.h b/include/tbmintrin.h deleted file mode 100644 index f4e848a..0000000 --- a/include/tbmintrin.h +++ /dev/null @@ -1,140 +0,0 @@ -/*===---- tbmintrin.h - TBM intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __TBMINTRIN_H -#define __TBMINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("tbm"))) - -#define __bextri_u32(a, b) \ - ((unsigned int)__builtin_ia32_bextri_u32((unsigned int)(a), \ - (unsigned int)(b))) - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcfill_u32(unsigned int __a) -{ - return __a & (__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blci_u32(unsigned int __a) -{ - return __a | ~(__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcic_u32(unsigned int __a) -{ - return ~__a & (__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcmsk_u32(unsigned int __a) -{ - return __a ^ (__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blcs_u32(unsigned int __a) -{ - return __a | (__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsfill_u32(unsigned int __a) -{ - return __a | (__a - 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__blsic_u32(unsigned int __a) -{ - return ~__a | (__a - 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__t1mskc_u32(unsigned int __a) -{ - return ~__a | (__a + 1); -} - -static __inline__ unsigned int __DEFAULT_FN_ATTRS -__tzmsk_u32(unsigned int __a) -{ - return ~__a & (__a - 1); -} - -#ifdef __x86_64__ -#define __bextri_u64(a, b) \ - ((unsigned long long)__builtin_ia32_bextri_u64((unsigned long long)(a), \ - (unsigned long long)(b))) - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcfill_u64(unsigned long long __a) -{ - return __a & (__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blci_u64(unsigned long long __a) -{ - return __a | ~(__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcic_u64(unsigned long long __a) -{ - return ~__a & (__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcmsk_u64(unsigned long long __a) 
-{ - return __a ^ (__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blcs_u64(unsigned long long __a) -{ - return __a | (__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsfill_u64(unsigned long long __a) -{ - return __a | (__a - 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__blsic_u64(unsigned long long __a) -{ - return ~__a | (__a - 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__t1mskc_u64(unsigned long long __a) -{ - return ~__a | (__a + 1); -} - -static __inline__ unsigned long long __DEFAULT_FN_ATTRS -__tzmsk_u64(unsigned long long __a) -{ - return ~__a & (__a - 1); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif /* __TBMINTRIN_H */ diff --git a/include/tmmintrin.h b/include/tmmintrin.h deleted file mode 100644 index e640934..0000000 --- a/include/tmmintrin.h +++ /dev/null @@ -1,787 +0,0 @@ -/*===---- tmmintrin.h - SSSE3 intrinsics -----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __TMMINTRIN_H -#define __TMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("ssse3"), __min_vector_width__(64))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,ssse3"), __min_vector_width__(64))) - -/// Computes the absolute value of each of the packed 8-bit signed -/// integers in the source operand and stores the 8-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PABSB instruction. -/// -/// \param __a -/// A 64-bit vector of [8 x i8]. -/// \returns A 64-bit integer vector containing the absolute values of the -/// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_abs_pi8(__m64 __a) -{ - return (__m64)__builtin_ia32_pabsb((__v8qi)__a); -} - -/// Computes the absolute value of each of the packed 8-bit signed -/// integers in the source operand and stores the 8-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPABSB instruction. -/// -/// \param __a -/// A 128-bit vector of [16 x i8]. -/// \returns A 128-bit integer vector containing the absolute values of the -/// elements in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_abs_epi8(__m128i __a) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pabsb128((__v16qi)__a); -#else - return (__m128i)__builtin_elementwise_abs((__v16qs)__a); -#endif -} - -/// Computes the absolute value of each of the packed 16-bit signed -/// integers in the source operand and stores the 16-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PABSW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16]. -/// \returns A 64-bit integer vector containing the absolute values of the -/// elements in the operand. 
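Whichever code path the header selects (the ia32 builtin on clang < 14 or __builtin_elementwise_abs on newer clang), the observable behaviour of the SSSE3 absolute-value intrinsic is the same. A minimal sketch (illustrative only, not part of the patch; assumes an SSSE3 build, e.g. -mssse3):

#include <tmmintrin.h>

static int abs_of_first_lane(void)
{
    __m128i v = _mm_setr_epi8(-7, 2, -3, 4, -5, 6, -7, 8,
                              -9, 10, -11, 12, -13, 14, -15, 16);
    __m128i a = _mm_abs_epi8(v);          /* per-lane |x|; lane 0: -7 -> 7 */
    unsigned char out[16];
    _mm_storeu_si128((__m128i *)out, a);
    return out[0];                        /* 7 */
}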
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_abs_pi16(__m64 __a) -{ - return (__m64)__builtin_ia32_pabsw((__v4hi)__a); -} - -/// Computes the absolute value of each of the packed 16-bit signed -/// integers in the source operand and stores the 16-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPABSW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16]. -/// \returns A 128-bit integer vector containing the absolute values of the -/// elements in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_abs_epi16(__m128i __a) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pabsw128((__v8hi)__a); -#else - return (__m128i)__builtin_elementwise_abs((__v8hi)__a); -#endif -} - -/// Computes the absolute value of each of the packed 32-bit signed -/// integers in the source operand and stores the 32-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PABSD instruction. -/// -/// \param __a -/// A 64-bit vector of [2 x i32]. -/// \returns A 64-bit integer vector containing the absolute values of the -/// elements in the operand. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_abs_pi32(__m64 __a) -{ - return (__m64)__builtin_ia32_pabsd((__v2si)__a); -} - -/// Computes the absolute value of each of the packed 32-bit signed -/// integers in the source operand and stores the 32-bit unsigned integer -/// results in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPABSD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32]. -/// \returns A 128-bit integer vector containing the absolute values of the -/// elements in the operand. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_abs_epi32(__m128i __a) -{ -#if (__clang_major__ < 14) - return (__m128i)__builtin_ia32_pabsd128((__v4si)__a); -#else - return (__m128i)__builtin_elementwise_abs((__v4si)__a); -#endif -} - -/// Concatenates the two 128-bit integer vector operands, and -/// right-shifts the result by the number of bytes specified in the immediate -/// operand. -/// -/// \headerfile -/// -/// \code -/// __m128i _mm_alignr_epi8(__m128i a, __m128i b, const int n); -/// \endcode -/// -/// This intrinsic corresponds to the \c PALIGNR instruction. -/// -/// \param a -/// A 128-bit vector of [16 x i8] containing one of the source operands. -/// \param b -/// A 128-bit vector of [16 x i8] containing one of the source operands. -/// \param n -/// An immediate operand specifying how many bytes to right-shift the result. -/// \returns A 128-bit integer vector containing the concatenated right-shifted -/// value. -#define _mm_alignr_epi8(a, b, n) \ - ((__m128i)__builtin_ia32_palignr128((__v16qi)(__m128i)(a), \ - (__v16qi)(__m128i)(b), (n))) - -/// Concatenates the two 64-bit integer vector operands, and right-shifts -/// the result by the number of bytes specified in the immediate operand. -/// -/// \headerfile -/// -/// \code -/// __m64 _mm_alignr_pi8(__m64 a, __m64 b, const int n); -/// \endcode -/// -/// This intrinsic corresponds to the \c PALIGNR instruction. -/// -/// \param a -/// A 64-bit vector of [8 x i8] containing one of the source operands. -/// \param b -/// A 64-bit vector of [8 x i8] containing one of the source operands. -/// \param n -/// An immediate operand specifying how many bytes to right-shift the result. 
-/// \returns A 64-bit integer vector containing the concatenated right-shifted -/// value. -#define _mm_alignr_pi8(a, b, n) \ - ((__m64)__builtin_ia32_palignr((__v8qi)(__m64)(a), (__v8qi)(__m64)(b), (n))) - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 128-bit vectors of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHADDW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 128-bit vector of [8 x i16] containing the horizontal sums of -/// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddw128((__v8hi)__a, (__v8hi)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 128-bit vectors of [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHADDD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 128-bit vector of [4 x i32] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 128-bit vector of [4 x i32] containing the horizontal sums of -/// both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadd_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddd128((__v4si)__a, (__v4si)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 64-bit vectors of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHADDW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 64-bit vector of [4 x i16] containing the horizontal sums of both -/// operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hadd_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phaddw((__v4hi)__a, (__v4hi)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 64-bit vectors of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHADDD instruction. -/// -/// \param __a -/// A 64-bit vector of [2 x i32] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 64-bit vector of [2 x i32] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 64-bit vector of [2 x i32] containing the horizontal sums of both -/// operands. 
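(Editorial aside, not part of the patch: the horizontal-add intrinsics documented above sum adjacent element pairs, so reducing a 128-bit vector of [4 x i32] to a single total takes two passes. A minimal sketch, assuming an SSSE3-capable x86 target built with -mssse3; the values and variable names are illustrative only.)

    #include <tmmintrin.h>   /* SSSE3 */
    #include <stdio.h>

    int main(void)
    {
        __m128i v = _mm_setr_epi32(1, 2, 3, 4);

        /* First pass:  {1+2, 3+4, 1+2, 3+4} = {3, 7, 3, 7}
         * Second pass: {3+7, 3+7, 3+7, 3+7} = {10, 10, 10, 10} */
        __m128i s = _mm_hadd_epi32(v, v);
        s = _mm_hadd_epi32(s, s);

        printf("%d\n", _mm_cvtsi128_si32(s));   /* prints 10 */
        return 0;
    }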
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hadd_pi32(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phaddd((__v2si)__a, (__v2si)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 128-bit vectors of [8 x i16]. Positive sums greater than 0x7FFF are -/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHADDSW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated -/// sums of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hadds_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phaddsw128((__v8hi)__a, (__v8hi)__b); -} - -/// Horizontally adds the adjacent pairs of values contained in 2 packed -/// 64-bit vectors of [4 x i16]. Positive sums greater than 0x7FFF are -/// saturated to 0x7FFF. Negative sums less than 0x8000 are saturated to -/// 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHADDSW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the lower bits of the -/// destination. -/// \param __b -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal sums of the values are stored in the upper bits of the -/// destination. -/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated -/// sums of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hadds_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phaddsw((__v4hi)__a, (__v4hi)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 128-bit vectors of [8 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHSUBW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 128-bit vector of [8 x i16] containing the horizontal differences -/// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubw128((__v8hi)__a, (__v8hi)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 128-bit vectors of [4 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHSUBD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x i32] containing one of the source operands. The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 128-bit vector of [4 x i32] containing one of the source operands. 
The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 128-bit vector of [4 x i32] containing the horizontal differences -/// of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsub_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubd128((__v4si)__a, (__v4si)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 64-bit vectors of [4 x i16]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHSUBW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 64-bit vector of [4 x i16] containing the horizontal differences -/// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hsub_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phsubw((__v4hi)__a, (__v4hi)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 64-bit vectors of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHSUBD instruction. -/// -/// \param __a -/// A 64-bit vector of [2 x i32] containing one of the source operands. The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 64-bit vector of [2 x i32] containing one of the source operands. The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 64-bit vector of [2 x i32] containing the horizontal differences -/// of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hsub_pi32(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phsubd((__v2si)__a, (__v2si)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 128-bit vectors of [8 x i16]. Positive differences greater than -/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are -/// saturated to 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPHSUBSW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 128-bit vector of [8 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 128-bit vector of [8 x i16] containing the horizontal saturated -/// differences of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_phsubsw128((__v8hi)__a, (__v8hi)__b); -} - -/// Horizontally subtracts the adjacent pairs of values contained in 2 -/// packed 64-bit vectors of [4 x i16]. Positive differences greater than -/// 0x7FFF are saturated to 0x7FFF. Negative differences less than 0x8000 are -/// saturated to 0x8000. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PHSUBSW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16] containing one of the source operands. 
The -/// horizontal differences between the values are stored in the lower bits of -/// the destination. -/// \param __b -/// A 64-bit vector of [4 x i16] containing one of the source operands. The -/// horizontal differences between the values are stored in the upper bits of -/// the destination. -/// \returns A 64-bit vector of [4 x i16] containing the horizontal saturated -/// differences of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_hsubs_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_phsubsw((__v4hi)__a, (__v4hi)__b); -} - -/// Multiplies corresponding pairs of packed 8-bit unsigned integer -/// values contained in the first source operand and packed 8-bit signed -/// integer values contained in the second source operand, adds pairs of -/// contiguous products with signed saturation, and writes the 16-bit sums to -/// the corresponding bits in the destination. -/// -/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of -/// both operands are multiplied, and the sum of both results is written to -/// bits [15:0] of the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPMADDUBSW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the first source operand. -/// \param __b -/// A 128-bit integer vector containing the second source operand. -/// \returns A 128-bit integer vector containing the sums of products of both -/// operands: \n -/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n -/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n -/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n -/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) \n -/// \a R4 := (\a __a8 * \a __b8) + (\a __a9 * \a __b9) \n -/// \a R5 := (\a __a10 * \a __b10) + (\a __a11 * \a __b11) \n -/// \a R6 := (\a __a12 * \a __b12) + (\a __a13 * \a __b13) \n -/// \a R7 := (\a __a14 * \a __b14) + (\a __a15 * \a __b15) -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maddubs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmaddubsw128((__v16qi)__a, (__v16qi)__b); -} - -/// Multiplies corresponding pairs of packed 8-bit unsigned integer -/// values contained in the first source operand and packed 8-bit signed -/// integer values contained in the second source operand, adds pairs of -/// contiguous products with signed saturation, and writes the 16-bit sums to -/// the corresponding bits in the destination. -/// -/// For example, bits [7:0] of both operands are multiplied, bits [15:8] of -/// both operands are multiplied, and the sum of both results is written to -/// bits [15:0] of the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PMADDUBSW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the first source operand. -/// \param __b -/// A 64-bit integer vector containing the second source operand. 
-/// \returns A 64-bit integer vector containing the sums of products of both -/// operands: \n -/// \a R0 := (\a __a0 * \a __b0) + (\a __a1 * \a __b1) \n -/// \a R1 := (\a __a2 * \a __b2) + (\a __a3 * \a __b3) \n -/// \a R2 := (\a __a4 * \a __b4) + (\a __a5 * \a __b5) \n -/// \a R3 := (\a __a6 * \a __b6) + (\a __a7 * \a __b7) -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_maddubs_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pmaddubsw((__v8qi)__a, (__v8qi)__b); -} - -/// Multiplies packed 16-bit signed integer values, truncates the 32-bit -/// products to the 18 most significant bits by right-shifting, rounds the -/// truncated value by adding 1, and writes bits [16:1] to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPMULHRSW instruction. -/// -/// \param __a -/// A 128-bit vector of [8 x i16] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [8 x i16] containing one of the source operands. -/// \returns A 128-bit vector of [8 x i16] containing the rounded and scaled -/// products of both operands. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_mulhrs_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pmulhrsw128((__v8hi)__a, (__v8hi)__b); -} - -/// Multiplies packed 16-bit signed integer values, truncates the 32-bit -/// products to the 18 most significant bits by right-shifting, rounds the -/// truncated value by adding 1, and writes bits [16:1] to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PMULHRSW instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16] containing one of the source operands. -/// \param __b -/// A 64-bit vector of [4 x i16] containing one of the source operands. -/// \returns A 64-bit vector of [4 x i16] containing the rounded and scaled -/// products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_mulhrs_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pmulhrsw((__v4hi)__a, (__v4hi)__b); -} - -/// Copies the 8-bit integers from a 128-bit integer vector to the -/// destination or clears 8-bit values in the destination, as specified by -/// the second source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPSHUFB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the values to be copied. -/// \param __b -/// A 128-bit integer vector containing control bytes corresponding to -/// positions in the destination: -/// Bit 7: \n -/// 1: Clear the corresponding byte in the destination. \n -/// 0: Copy the selected source byte to the corresponding byte in the -/// destination. \n -/// Bits [6:4] Reserved. \n -/// Bits [3:0] select the source byte to be copied. -/// \returns A 128-bit integer vector containing the copied or cleared values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shuffle_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_pshufb128((__v16qi)__a, (__v16qi)__b); -} - -/// Copies the 8-bit integers from a 64-bit integer vector to the -/// destination or clears 8-bit values in the destination, as specified by -/// the second source operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PSHUFB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the values to be copied. 
-/// \param __b -/// A 64-bit integer vector containing control bytes corresponding to -/// positions in the destination: -/// Bit 7: \n -/// 1: Clear the corresponding byte in the destination. \n -/// 0: Copy the selected source byte to the corresponding byte in the -/// destination. \n -/// Bits [3:0] select the source byte to be copied. -/// \returns A 64-bit integer vector containing the copied or cleared values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_shuffle_pi8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pshufb((__v8qi)__a, (__v8qi)__b); -} - -/// For each 8-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the byte in the second source is negative, calculate the two's -/// complement of the corresponding byte in the first source, and write that -/// value to the destination. If the byte in the second source is positive, -/// copy the corresponding byte from the first source to the destination. If -/// the byte in the second source is zero, clear the corresponding byte in -/// the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPSIGNB instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the values to be copied. -/// \param __b -/// A 128-bit integer vector containing control bytes corresponding to -/// positions in the destination. -/// \returns A 128-bit integer vector containing the resultant values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); -} - -/// For each 16-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the word in the second source is negative, calculate the two's -/// complement of the corresponding word in the first source, and write that -/// value to the destination. If the word in the second source is positive, -/// copy the corresponding word from the first source to the destination. If -/// the word in the second source is zero, clear the corresponding word in -/// the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPSIGNW instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the values to be copied. -/// \param __b -/// A 128-bit integer vector containing control words corresponding to -/// positions in the destination. -/// \returns A 128-bit integer vector containing the resultant values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); -} - -/// For each 32-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the doubleword in the second source is negative, calculate the two's -/// complement of the corresponding word in the first source, and write that -/// value to the destination. If the doubleword in the second source is -/// positive, copy the corresponding word from the first source to the -/// destination. If the doubleword in the second source is zero, clear the -/// corresponding word in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c VPSIGND instruction. -/// -/// \param __a -/// A 128-bit integer vector containing the values to be copied. 
-/// \param __b -/// A 128-bit integer vector containing control doublewords corresponding to -/// positions in the destination. -/// \returns A 128-bit integer vector containing the resultant values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); -} - -/// For each 8-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the byte in the second source is negative, calculate the two's -/// complement of the corresponding byte in the first source, and write that -/// value to the destination. If the byte in the second source is positive, -/// copy the corresponding byte from the first source to the destination. If -/// the byte in the second source is zero, clear the corresponding byte in -/// the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PSIGNB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the values to be copied. -/// \param __b -/// A 64-bit integer vector containing control bytes corresponding to -/// positions in the destination. -/// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_sign_pi8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_psignb((__v8qi)__a, (__v8qi)__b); -} - -/// For each 16-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the word in the second source is negative, calculate the two's -/// complement of the corresponding word in the first source, and write that -/// value to the destination. If the word in the second source is positive, -/// copy the corresponding word from the first source to the destination. If -/// the word in the second source is zero, clear the corresponding word in -/// the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PSIGNW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the values to be copied. -/// \param __b -/// A 64-bit integer vector containing control words corresponding to -/// positions in the destination. -/// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_sign_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_psignw((__v4hi)__a, (__v4hi)__b); -} - -/// For each 32-bit integer in the first source operand, perform one of -/// the following actions as specified by the second source operand. -/// -/// If the doubleword in the second source is negative, calculate the two's -/// complement of the corresponding doubleword in the first source, and -/// write that value to the destination. If the doubleword in the second -/// source is positive, copy the corresponding doubleword from the first -/// source to the destination. If the doubleword in the second source is -/// zero, clear the corresponding doubleword in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c PSIGND instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the values to be copied. -/// \param __b -/// A 64-bit integer vector containing two control doublewords corresponding -/// to positions in the destination. -/// \returns A 64-bit integer vector containing the resultant values. 
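(Editorial aside, not part of the patch: a small self-contained sketch of the byte-shuffle and sign intrinsics described above, assuming SSSE3 support (-mssse3); all inputs are illustrative.)

    #include <tmmintrin.h>   /* SSSE3 */
    #include <stdio.h>

    int main(void)
    {
        /* _mm_shuffle_epi8: bits [3:0] of each control byte select the source
         * byte; a control byte with bit 7 set would clear the destination byte. */
        __m128i text = _mm_setr_epi8('a','b','c','d','e','f','g','h',
                                     'i','j','k','l','m','n','o','p');
        __m128i rev  = _mm_setr_epi8(15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0);
        char out[17] = {0};
        _mm_storeu_si128((__m128i *)out, _mm_shuffle_epi8(text, rev));
        printf("%s\n", out);                               /* ponmlkjihgfedcba */

        /* _mm_sign_epi32: negative control negates, zero clears, positive copies. */
        __m128i a = _mm_setr_epi32(10, 20, 30, 40);
        __m128i b = _mm_setr_epi32(-1,  0,  1, -5);
        int r[4];
        _mm_storeu_si128((__m128i *)r, _mm_sign_epi32(a, b));
        printf("%d %d %d %d\n", r[0], r[1], r[2], r[3]);   /* -10 0 30 40 */
        return 0;
    }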
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_sign_pi32(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_psignd((__v2si)__a, (__v2si)__b); -} - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX - -#endif /* __TMMINTRIN_H */ diff --git a/include/tsxldtrkintrin.h b/include/tsxldtrkintrin.h deleted file mode 100644 index 491823e..0000000 --- a/include/tsxldtrkintrin.h +++ /dev/null @@ -1,56 +0,0 @@ -/*===------------- tsxldtrkintrin.h - tsxldtrk intrinsics ------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __TSXLDTRKINTRIN_H -#define __TSXLDTRKINTRIN_H - -/* Define the default attributes for the functions in this file */ -#define _DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("tsxldtrk"))) - -/// Marks the start of an TSX (RTM) suspend load address tracking region. If -/// this intrinsic is used inside a transactional region, subsequent loads -/// are not added to the read set of the transaction. If it's used inside a -/// suspend load address tracking region it will cause transaction abort. -/// If it's used outside of a transactional region it behaves like a NOP. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c XSUSLDTRK instruction. -/// -static __inline__ void _DEFAULT_FN_ATTRS -_xsusldtrk (void) -{ - __builtin_ia32_xsusldtrk(); -} - -/// Marks the end of an TSX (RTM) suspend load address tracking region. If this -/// intrinsic is used inside a suspend load address tracking region it will -/// end the suspend region and all following load addresses will be added to -/// the transaction read set. If it's used inside an active transaction but -/// not in a suspend region it will cause transaction abort. If it's used -/// outside of a transactional region it behaves like a NOP. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the \c XRESLDTRK instruction. -/// -static __inline__ void _DEFAULT_FN_ATTRS -_xresldtrk (void) -{ - __builtin_ia32_xresldtrk(); -} - -#undef _DEFAULT_FN_ATTRS - -#endif /* __TSXLDTRKINTRIN_H */ diff --git a/include/uintrintrin.h b/include/uintrintrin.h deleted file mode 100644 index e3839dc..0000000 --- a/include/uintrintrin.h +++ /dev/null @@ -1,157 +0,0 @@ -/*===------------------ uintrintrin.h - UINTR intrinsics -------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86GPRINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __UINTRINTRIN_H -#define __UINTRINTRIN_H - -/* Define the default attributes for the functions in this file */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("uintr"))) - -#ifdef __x86_64__ - -struct __uintr_frame -{ - unsigned long long rip; - unsigned long long rflags; - unsigned long long rsp; -}; - -/// Clears the user interrupt flag (UIF). Its effect takes place immediately: a -/// user interrupt cannot be delivered on the instruction boundary following -/// CLUI. 
Can be executed only if CR4.UINT = 1, the logical processor is in -/// 64-bit mode, and software is not executing inside an enclave; otherwise, -/// each causes an invalid-opcode exception. Causes a transactional abort if -/// executed inside a transactional region; the abort loads EAX as it would -/// had it been due to an execution of CLI. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CLUI instruction. -/// -/// \operation -/// UIF := 0 -/// \endoperation -static __inline__ void __DEFAULT_FN_ATTRS -_clui (void) -{ - __builtin_ia32_clui(); -} - -/// Sets the user interrupt flag (UIF). Its effect takes place immediately; a -/// user interrupt may be delivered on the instruction boundary following -/// STUI. Can be executed only if CR4.UINT = 1, the logical processor is in -/// 64-bit mode, and software is not executing inside an enclave; otherwise, -/// each causes an invalid-opcode exception. Causes a transactional abort if -/// executed inside a transactional region; the abort loads EAX as it would -/// had it been due to an execution of STI. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the STUI instruction. -/// -/// \operation -/// UIF := 1 -/// \endoperation -static __inline__ void __DEFAULT_FN_ATTRS -_stui (void) -{ - __builtin_ia32_stui(); -} - -/// Get the current value of the user interrupt flag (UIF). Can be executed -/// regardless of CPL and inside a transactional region. Can be executed only -/// if CR4.UINT = 1, the logical processor is in 64-bit mode, and software is -/// not executing inside an enclave; otherwise, it causes an invalid-opcode -/// exception. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the TESTUI instruction. -/// -/// \returns The current value of the user interrupt flag (UIF). -/// -/// \operation -/// CF := UIF -/// ZF := 0 -/// AF := 0 -/// OF := 0 -/// PF := 0 -/// SF := 0 -/// dst := CF -/// \endoperation -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_testui (void) -{ - return __builtin_ia32_testui(); -} - -/// Send interprocessor user interrupt. Can be executed only if -/// CR4.UINT = IA32_UINT_TT[0] = 1, the logical processor is in 64-bit mode, -/// and software is not executing inside an enclave; otherwise, it causes an -/// invalid-opcode exception. May be executed at any privilege level, all of -/// its memory accesses are performed with supervisor privilege. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the SENDUIPI instruction -/// -/// \param __a -/// Index of user-interrupt target table entry in user-interrupt target -/// table. 
-/// -/// \operation -/// IF __a > UITTSZ -/// GP (0) -/// FI -/// tempUITTE := MEM[UITTADDR + (a<<4)] -/// // tempUITTE must be valid, and can't have any reserved bit set -/// IF (tempUITTE.V == 0 OR tempUITTE[7:1] != 0) -/// GP (0) -/// FI -/// tempUPID := MEM[tempUITTE.UPIDADDR] // under lock -/// // tempUPID can't have any reserved bit set -/// IF (tempUPID[15:2] != 0 OR tempUPID[31:24] != 0) -/// GP (0) // release lock -/// FI -/// tempUPID.PIR[tempUITTE.UV] := 1; -/// IF (tempUPID.SN == 0 AND tempUPID.ON == 0) -/// tempUPID.ON := 1 -/// sendNotify := 1 -/// ELSE -/// sendNotify := 0 -/// FI -/// MEM[tempUITTE.UPIDADDR] := tempUPID // release lock -/// IF sendNotify == 1 -/// IF IA32_APIC_BASE[10] == 1 // local APIC is in x2APIC mode -/// // send ordinary IPI with vector tempUPID.NV to 32-bit physical APIC -/// // ID tempUPID.NDST -/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST) -/// ELSE -/// // send ordinary IPI with vector tempUPID.NV to 8-bit physical APIC -/// // ID tempUPID.NDST[15:8] -/// SendOrdinaryIPI(tempUPID.NV, tempUPID.NDST[15:8]) -/// FI -/// FI -/// \endoperation -static __inline__ void __DEFAULT_FN_ATTRS -_senduipi (unsigned long long __a) -{ - __builtin_ia32_senduipi(__a); -} - -#endif /* __x86_64__ */ - -#undef __DEFAULT_FN_ATTRS - -#endif /* __UINTRINTRIN_H */ diff --git a/include/vaesintrin.h b/include/vaesintrin.h deleted file mode 100644 index 294dcff..0000000 --- a/include/vaesintrin.h +++ /dev/null @@ -1,85 +0,0 @@ -/*===------------------ vaesintrin.h - VAES intrinsics ---------------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __VAESINTRIN_H -#define __VAESINTRIN_H - -/* Default attributes for YMM forms. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("vaes"), __min_vector_width__(256))) - -/* Default attributes for ZMM forms. 
*/ -#define __DEFAULT_FN_ATTRS_F __attribute__((__always_inline__, __nodebug__, __target__("avx512f,vaes"), __min_vector_width__(512))) - - -static __inline__ __m256i __DEFAULT_FN_ATTRS - _mm256_aesenc_epi128(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_aesenc256((__v4di) __A, - (__v4di) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS - _mm256_aesdec_epi128(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_aesdec256((__v4di) __A, - (__v4di) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS - _mm256_aesenclast_epi128(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_aesenclast256((__v4di) __A, - (__v4di) __B); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS - _mm256_aesdeclast_epi128(__m256i __A, __m256i __B) -{ - return (__m256i) __builtin_ia32_aesdeclast256((__v4di) __A, - (__v4di) __B); -} - -#ifdef __AVX512FINTRIN_H -static __inline__ __m512i __DEFAULT_FN_ATTRS_F - _mm512_aesenc_epi128(__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_aesenc512((__v8di) __A, - (__v8di) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_F - _mm512_aesdec_epi128(__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_aesdec512((__v8di) __A, - (__v8di) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_F - _mm512_aesenclast_epi128(__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_aesenclast512((__v8di) __A, - (__v8di) __B); -} - -static __inline__ __m512i __DEFAULT_FN_ATTRS_F - _mm512_aesdeclast_epi128(__m512i __A, __m512i __B) -{ - return (__m512i) __builtin_ia32_aesdeclast512((__v8di) __A, - (__v8di) __B); -} -#endif // __AVX512FINTRIN_H - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_F - -#endif // __VAESINTRIN_H diff --git a/include/vpclmulqdqintrin.h b/include/vpclmulqdqintrin.h deleted file mode 100644 index 485692e..0000000 --- a/include/vpclmulqdqintrin.h +++ /dev/null @@ -1,30 +0,0 @@ -/*===------------ vpclmulqdqintrin.h - VPCLMULQDQ intrinsics ---------------=== - * - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __VPCLMULQDQINTRIN_H -#define __VPCLMULQDQINTRIN_H - -#define _mm256_clmulepi64_epi128(A, B, I) \ - ((__m256i)__builtin_ia32_pclmulqdq256((__v4di)(__m256i)(A), \ - (__v4di)(__m256i)(B), \ - (char)(I))) - -#ifdef __AVX512FINTRIN_H -#define _mm512_clmulepi64_epi128(A, B, I) \ - ((__m512i)__builtin_ia32_pclmulqdq512((__v8di)(__m512i)(A), \ - (__v8di)(__m512i)(B), \ - (char)(I))) -#endif // __AVX512FINTRIN_H - -#endif /* __VPCLMULQDQINTRIN_H */ - diff --git a/include/waitpkgintrin.h b/include/waitpkgintrin.h deleted file mode 100644 index 7ecada4..0000000 --- a/include/waitpkgintrin.h +++ /dev/null @@ -1,42 +0,0 @@ -/*===----------------------- waitpkgintrin.h - WAITPKG --------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __WAITPKGINTRIN_H -#define __WAITPKGINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS \ - __attribute__((__always_inline__, __nodebug__, __target__("waitpkg"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_umonitor (void * __address) -{ - __builtin_ia32_umonitor (__address); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_umwait (unsigned int __control, unsigned long long __counter) -{ - return __builtin_ia32_umwait (__control, - (unsigned int)(__counter >> 32), (unsigned int)__counter); -} - -static __inline__ unsigned char __DEFAULT_FN_ATTRS -_tpause (unsigned int __control, unsigned long long __counter) -{ - return __builtin_ia32_tpause (__control, - (unsigned int)(__counter >> 32), (unsigned int)__counter); -} - -#undef __DEFAULT_FN_ATTRS - -#endif /* __WAITPKGINTRIN_H */ diff --git a/include/wbnoinvdintrin.h b/include/wbnoinvdintrin.h deleted file mode 100644 index cac0347..0000000 --- a/include/wbnoinvdintrin.h +++ /dev/null @@ -1,24 +0,0 @@ -/*===-------------- wbnoinvdintrin.h - wbnoinvd intrinsic-------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#if !defined __X86INTRIN_H && !defined __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __WBNOINVDINTRIN_H -#define __WBNOINVDINTRIN_H - -static __inline__ void - __attribute__((__always_inline__, __nodebug__, __target__("wbnoinvd"))) -_wbnoinvd (void) -{ - __builtin_ia32_wbnoinvd (); -} - -#endif /* __WBNOINVDINTRIN_H */ diff --git a/include/wmmintrin.h b/include/wmmintrin.h deleted file mode 100644 index 49148db..0000000 --- a/include/wmmintrin.h +++ /dev/null @@ -1,23 +0,0 @@ -/*===---- wmmintrin.h - AES intrinsics ------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __WMMINTRIN_H -#define __WMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -#include <__wmmintrin_aes.h> - -#include <__wmmintrin_pclmul.h> - -#endif /* __WMMINTRIN_H */ diff --git a/include/x86gprintrin.h b/include/x86gprintrin.h deleted file mode 100644 index 01e741f..0000000 --- a/include/x86gprintrin.h +++ /dev/null @@ -1,35 +0,0 @@ -/*===--------------- x86gprintrin.h - X86 GPR intrinsics ------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86GPRINTRIN_H -#define __X86GPRINTRIN_H - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__HRESET__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__UINTR__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__CRC32__) -#include -#endif - -#define __SSC_MARK(Tag) \ - __asm__ __volatile__("mov {%%ebx, %%eax|eax, ebx}; " \ - "mov {%0, %%ebx|ebx, %0}; " \ - ".byte 0x64, 0x67, 0x90; " \ - "mov {%%eax, %%ebx|ebx, eax};" ::"i"(Tag) \ - : "%eax"); - -#endif /* __X86GPRINTRIN_H */ diff --git a/include/x86intrin.h b/include/x86intrin.h deleted file mode 100644 index 768d0e5..0000000 --- a/include/x86intrin.h +++ /dev/null @@ -1,63 +0,0 @@ -/*===---- x86intrin.h - X86 intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#define __X86INTRIN_H - -#include - -#include - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__3dNOW__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__PRFCHW__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__SSE4A__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__FMA4__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__XOP__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__TBM__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__LWP__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__MWAITX__) -#include -#endif - -#if !(defined(_MSC_VER) || defined(__SCE__)) || __has_feature(modules) || \ - defined(__CLZERO__) -#include -#endif - - -#endif /* __X86INTRIN_H */ diff --git a/include/xmmintrin.h b/include/xmmintrin.h deleted file mode 100644 index 1612d3d..0000000 --- a/include/xmmintrin.h +++ /dev/null @@ -1,3012 +0,0 @@ -/*===---- xmmintrin.h - SSE intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. 
- * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __XMMINTRIN_H -#define __XMMINTRIN_H - -#if !defined(__i386__) && !defined(__x86_64__) -#error "This header is only meant to be used on x86 and x64 architecture" -#endif - -#include - -typedef int __v4si __attribute__((__vector_size__(16))); -typedef float __v4sf __attribute__((__vector_size__(16))); -typedef float __m128 __attribute__((__vector_size__(16), __aligned__(16))); - -typedef float __m128_u __attribute__((__vector_size__(16), __aligned__(1))); - -/* Unsigned types */ -typedef unsigned int __v4su __attribute__((__vector_size__(16))); - -/* This header should only be included in a hosted environment as it depends on - * a standard library to provide allocation routines. */ -#if __STDC_HOSTED__ -#include -#endif - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("sse"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS_MMX __attribute__((__always_inline__, __nodebug__, __target__("mmx,sse"), __min_vector_width__(64))) - -/// Adds the 32-bit float values in the low-order bits of the operands. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDSS / ADDSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The lower 32 bits of this operand are used in the calculation. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The lower 32 bits of this operand are used in the calculation. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the sum -/// of the lower 32 bits of both operands. The upper 96 bits are copied from -/// the upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_add_ss(__m128 __a, __m128 __b) -{ - __a[0] += __b[0]; - return __a; -} - -/// Adds two 128-bit vectors of [4 x float], and returns the results of -/// the addition. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VADDPS / ADDPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \returns A 128-bit vector of [4 x float] containing the sums of both -/// operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_add_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4sf)__a + (__v4sf)__b); -} - -/// Subtracts the 32-bit float value in the low-order bits of the second -/// operand from the corresponding value in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBSS / SUBSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the minuend. The lower 32 bits -/// of this operand are used in the calculation. -/// \param __b -/// A 128-bit vector of [4 x float] containing the subtrahend. The lower 32 -/// bits of this operand are used in the calculation. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// difference of the lower 32 bits of both operands. The upper 96 bits are -/// copied from the upper 96 bits of the first source operand. 
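(Editorial aside, not part of the patch: the _ss forms above operate on lane 0 only and pass the upper three lanes of the first operand through unchanged, which the following sketch makes visible; values are illustrative, SSE target assumed.)

    #include <xmmintrin.h>   /* SSE */
    #include <stdio.h>

    int main(void)
    {
        __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
        __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
        float r[4];

        /* Scalar form: only lane 0 is added; lanes 1..3 come from the first operand. */
        _mm_storeu_ps(r, _mm_add_ss(a, b));
        printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   /* 11 2 3 4 */

        /* Packed form: every lane is added. */
        _mm_storeu_ps(r, _mm_add_ps(a, b));
        printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);   /* 11 22 33 44 */
        return 0;
    }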
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sub_ss(__m128 __a, __m128 __b) -{ - __a[0] -= __b[0]; - return __a; -} - -/// Subtracts each of the values of the second operand from the first -/// operand, both of which are 128-bit vectors of [4 x float] and returns -/// the results of the subtraction. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSUBPS / SUBPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the minuend. -/// \param __b -/// A 128-bit vector of [4 x float] containing the subtrahend. -/// \returns A 128-bit vector of [4 x float] containing the differences between -/// both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sub_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4sf)__a - (__v4sf)__b); -} - -/// Multiplies two 32-bit float values in the low-order bits of the -/// operands. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULSS / MULSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The lower 32 bits of this operand are used in the calculation. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// The lower 32 bits of this operand are used in the calculation. -/// \returns A 128-bit vector of [4 x float] containing the product of the lower -/// 32 bits of both operands. The upper 96 bits are copied from the upper 96 -/// bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mul_ss(__m128 __a, __m128 __b) -{ - __a[0] *= __b[0]; - return __a; -} - -/// Multiplies two 128-bit vectors of [4 x float] and returns the -/// results of the multiplication. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMULPS / MULPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \returns A 128-bit vector of [4 x float] containing the products of both -/// operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_mul_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4sf)__a * (__v4sf)__b); -} - -/// Divides the value in the low-order 32 bits of the first operand by -/// the corresponding value in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVSS / DIVSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the dividend. The lower 32 -/// bits of this operand are used in the calculation. -/// \param __b -/// A 128-bit vector of [4 x float] containing the divisor. The lower 32 bits -/// of this operand are used in the calculation. -/// \returns A 128-bit vector of [4 x float] containing the quotients of the -/// lower 32 bits of both operands. The upper 96 bits are copied from the -/// upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_div_ss(__m128 __a, __m128 __b) -{ - __a[0] /= __b[0]; - return __a; -} - -/// Divides two 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VDIVPS / DIVPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the dividend. -/// \param __b -/// A 128-bit vector of [4 x float] containing the divisor. -/// \returns A 128-bit vector of [4 x float] containing the quotients of both -/// operands. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_div_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4sf)__a / (__v4sf)__b); -} - -/// Calculates the square root of the value stored in the low-order bits -/// of a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTSS / SQRTSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the calculation. -/// \returns A 128-bit vector of [4 x float] containing the square root of the -/// value in the low-order bits of the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sqrt_ss(__m128 __a) -{ - return (__m128)__builtin_ia32_sqrtss((__v4sf)__a); -} - -/// Calculates the square roots of the values stored in a 128-bit vector -/// of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSQRTPS / SQRTPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the square roots of the -/// values in the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_sqrt_ps(__m128 __a) -{ - return __builtin_ia32_sqrtps((__v4sf)__a); -} - -/// Calculates the approximate reciprocal of the value stored in the -/// low-order bits of a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRCPSS / RCPSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the calculation. -/// \returns A 128-bit vector of [4 x float] containing the approximate -/// reciprocal of the value in the low-order bits of the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_rcp_ss(__m128 __a) -{ - return (__m128)__builtin_ia32_rcpss((__v4sf)__a); -} - -/// Calculates the approximate reciprocals of the values stored in a -/// 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRCPPS / RCPPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the approximate -/// reciprocals of the values in the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_rcp_ps(__m128 __a) -{ - return (__m128)__builtin_ia32_rcpps((__v4sf)__a); -} - -/// Calculates the approximate reciprocal of the square root of the value -/// stored in the low-order bits of a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRSQRTSS / RSQRTSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the calculation. -/// \returns A 128-bit vector of [4 x float] containing the approximate -/// reciprocal of the square root of the value in the low-order bits of the -/// operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_rsqrt_ss(__m128 __a) -{ - return __builtin_ia32_rsqrtss((__v4sf)__a); -} - -/// Calculates the approximate reciprocals of the square roots of the -/// values stored in a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VRSQRTPS / RSQRTPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the approximate -/// reciprocals of the square roots of the values in the operand. 
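(Editorial aside, not part of the patch: RCPPS/RSQRTPS return estimates with roughly 12 bits of precision, so callers commonly add one Newton-Raphson refinement step. A sketch for the reciprocal square root, with illustrative values.)

    #include <xmmintrin.h>   /* SSE */
    #include <stdio.h>

    int main(void)
    {
        __m128 x = _mm_setr_ps(1.0f, 4.0f, 9.0f, 16.0f);

        /* y0 = rsqrt estimate; one refinement step: y1 = y0 * (1.5 - 0.5*x*y0*y0). */
        __m128 y = _mm_rsqrt_ps(x);
        __m128 half_x = _mm_mul_ps(x, _mm_set1_ps(0.5f));
        y = _mm_mul_ps(y, _mm_sub_ps(_mm_set1_ps(1.5f),
                                     _mm_mul_ps(half_x, _mm_mul_ps(y, y))));

        float r[4];
        _mm_storeu_ps(r, y);
        printf("%f %f %f %f\n", r[0], r[1], r[2], r[3]);   /* ~1 0.5 0.3333 0.25 */
        return 0;
    }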
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_rsqrt_ps(__m128 __a) -{ - return __builtin_ia32_rsqrtps((__v4sf)__a); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands and returns the lesser value in the low-order bits of the -/// vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINSS / MINSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// minimum value between both operands. The upper 96 bits are copied from -/// the upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_min_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_minss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 128-bit vectors of [4 x float] and returns the lesser -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMINPS / MINPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. -/// \returns A 128-bit vector of [4 x float] containing the minimum values -/// between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_min_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_minps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands and returns the greater value in the low-order bits of a 128-bit -/// vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXSS / MAXSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// maximum value between both operands. The upper 96 bits are copied from -/// the upper 96 bits of the first source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_max_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_maxss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 128-bit vectors of [4 x float] and returns the greater -/// of each pair of values. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMAXPS / MAXPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. -/// \returns A 128-bit vector of [4 x float] containing the maximum values -/// between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_max_ps(__m128 __a, __m128 __b) -{ - return __builtin_ia32_maxps((__v4sf)__a, (__v4sf)__b); -} - -/// Performs a bitwise AND of two 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDPS / ANDPS instructions. -/// -/// \param __a -/// A 128-bit vector containing one of the source operands. -/// \param __b -/// A 128-bit vector containing one of the source operands. 
-/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the -/// values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_and_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4su)__a & (__v4su)__b); -} - -/// Performs a bitwise AND of two 128-bit vectors of [4 x float], using -/// the one's complement of the values contained in the first source -/// operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VANDNPS / ANDNPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing the first source operand. The -/// one's complement of this value is used in the bitwise AND. -/// \param __b -/// A 128-bit vector of [4 x float] containing the second source operand. -/// \returns A 128-bit vector of [4 x float] containing the bitwise AND of the -/// one's complement of the first operand and the values in the second -/// operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_andnot_ps(__m128 __a, __m128 __b) -{ - return (__m128)(~(__v4su)__a & (__v4su)__b); -} - -/// Performs a bitwise OR of two 128-bit vectors of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VORPS / ORPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \returns A 128-bit vector of [4 x float] containing the bitwise OR of the -/// values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_or_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4su)__a | (__v4su)__b); -} - -/// Performs a bitwise exclusive OR of two 128-bit vectors of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS / XORPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the source operands. -/// \returns A 128-bit vector of [4 x float] containing the bitwise exclusive OR -/// of the values between both operands. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_xor_ps(__m128 __a, __m128 __b) -{ - return (__m128)((__v4su)__a ^ (__v4su)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands for equality and returns the result of the comparison in the -/// low-order bits of a vector [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPEQSS / CMPEQSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpeq_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpeqss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] for equality. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPEQPS / CMPEQPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
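[Editor's note] Because the AND/ANDNOT/OR/XOR intrinsics above operate on the raw bit patterns of the four floats, a common use is sign-bit manipulation. A small illustrative sketch, assuming the hypothetical helper name abs_ps; _mm_set1_ps is defined further down in this header:

    #include <xmmintrin.h>

    /* Clear the sign bit of every lane: a four-wide fabsf(). */
    static inline __m128 abs_ps(__m128 x)
    {
        const __m128 sign = _mm_set1_ps(-0.0f);   /* only bit 31 set in each lane */
        return _mm_andnot_ps(sign, x);
    }
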
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpeq_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpeqps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is less than the -/// corresponding value in the second operand and returns the result of the -/// comparison in the low-order bits of a vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTSS / CMPLTSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmplt_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpltss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are less than those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTPS / CMPLTPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmplt_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpltps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is less than or -/// equal to the corresponding value in the second operand and returns the -/// result of the comparison in the low-order bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLESS / CMPLESS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmple_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpless((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are less than or equal to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLEPS / CMPLEPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmple_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpleps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is greater than -/// the corresponding value in the second operand and returns the result of -/// the comparison in the low-order bits of a vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTSS / CMPLTSS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpgt_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_shufflevector((__v4sf)__a, - (__v4sf)__builtin_ia32_cmpltss((__v4sf)__b, (__v4sf)__a), - 4, 1, 2, 3); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are greater than those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLTPS / CMPLTPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpgt_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpltps((__v4sf)__b, (__v4sf)__a); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is greater than -/// or equal to the corresponding value in the second operand and returns -/// the result of the comparison in the low-order bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLESS / CMPLESS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpge_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_shufflevector((__v4sf)__a, - (__v4sf)__builtin_ia32_cmpless((__v4sf)__b, (__v4sf)__a), - 4, 1, 2, 3); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are greater than or equal to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPLEPS / CMPLEPS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
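[Editor's note] The packed comparisons return all-ones or all-zeros per lane, which combines naturally with the AND/ANDNOT/OR intrinsics above to build branchless selects. An illustrative sketch; select_gt_ps is a hypothetical helper name, not part of the header:

    #include <xmmintrin.h>

    /* Per-lane select: r[i] = (a[i] > b[i]) ? x[i] : y[i]. */
    static inline __m128 select_gt_ps(__m128 a, __m128 b, __m128 x, __m128 y)
    {
        __m128 m = _mm_cmpgt_ps(a, b);            /* mask: all-ones where a > b */
        return _mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, y));
    }
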
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpge_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpleps((__v4sf)__b, (__v4sf)__a); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands for inequality and returns the result of the comparison in the -/// low-order bits of a vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNEQSS / CMPNEQSS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpneq_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpneqss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] for inequality. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNEQPS / CMPNEQPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpneq_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpneqps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is not less than -/// the corresponding value in the second operand and returns the result of -/// the comparison in the low-order bits of a vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTSS / CMPNLTSS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnlt_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnltss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are not less than those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTPS / CMPNLTPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnlt_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnltps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is not less than -/// or equal to the corresponding value in the second operand and returns -/// the result of the comparison in the low-order bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLESS / CMPNLESS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnle_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnless((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are not less than or equal to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLEPS / CMPNLEPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnle_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnleps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is not greater -/// than the corresponding value in the second operand and returns the -/// result of the comparison in the low-order bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTSS / CMPNLTSS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpngt_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_shufflevector((__v4sf)__a, - (__v4sf)__builtin_ia32_cmpnltss((__v4sf)__b, (__v4sf)__a), - 4, 1, 2, 3); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are not greater than those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLTPS / CMPNLTPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpngt_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnltps((__v4sf)__b, (__v4sf)__a); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is not greater -/// than or equal to the corresponding value in the second operand and -/// returns the result of the comparison in the low-order bits of a vector -/// of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLESS / CMPNLESS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnge_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_shufflevector((__v4sf)__a, - (__v4sf)__builtin_ia32_cmpnless((__v4sf)__b, (__v4sf)__a), - 4, 1, 2, 3); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are not greater than or equal to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPNLEPS / CMPNLEPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpnge_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpnleps((__v4sf)__b, (__v4sf)__a); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is ordered with -/// respect to the corresponding value in the second operand and returns the -/// result of the comparison in the low-order bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPORDSS / CMPORDSS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpord_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpordss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are ordered with respect to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPORDPS / CMPORDPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpord_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpordps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the value in the first operand is unordered -/// with respect to the corresponding value in the second operand and -/// returns the result of the comparison in the low-order bits of a vector -/// of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPUNORDSS / CMPUNORDSS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float] containing one of the operands. The lower -/// 32 bits of this operand are used in the comparison. -/// \returns A 128-bit vector of [4 x float] containing the comparison results -/// in the low-order bits. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpunord_ss(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpunordss((__v4sf)__a, (__v4sf)__b); -} - -/// Compares each of the corresponding 32-bit float values of the -/// 128-bit vectors of [4 x float] to determine if the values in the first -/// operand are unordered with respect to those in the second operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCMPUNORDPS / CMPUNORDPS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// \returns A 128-bit vector of [4 x float] containing the comparison results. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cmpunord_ps(__m128 __a, __m128 __b) -{ - return (__m128)__builtin_ia32_cmpunordps((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands for equality and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the -/// two lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comieq_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comieq((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the first operand is less than the second -/// operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. 
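[Editor's note] Since NaN compares unordered even against itself, _mm_cmpunord_ps(v, v) yields a per-lane NaN mask. A hedged sketch under that observation; count_nan_lanes is an illustrative name, and _mm_movemask_ps is declared later in this header:

    #include <xmmintrin.h>

    /* Count how many of the four lanes hold NaN. */
    static inline int count_nan_lanes(__m128 v)
    {
        __m128 unord = _mm_cmpunord_ps(v, v);     /* all-ones in NaN lanes */
        int mask = _mm_movemask_ps(unord);        /* one bit per lane */
        return (mask & 1) + ((mask >> 1) & 1) + ((mask >> 2) & 1) + ((mask >> 3) & 1);
    }
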
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_comilt_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comilt((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the first operand is less than or equal to the -/// second operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comile_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comile((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the first operand is greater than the second -/// operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the -/// two lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comigt_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comigt((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the first operand is greater than or equal to -/// the second operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comige_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comige((__v4sf)__a, (__v4sf)__b); -} - -/// Compares two 32-bit float values in the low-order bits of both -/// operands to determine if the first operand is not equal to the second -/// operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 1 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCOMISS / COMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. 
If either of the -/// two lower 32-bit values is NaN, 1 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_comineq_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_comineq((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine equality and returns -/// the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomieq_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomieq((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand is -/// less than the second operand and returns the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomilt_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomilt((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand is -/// less than or equal to the second operand and returns the result of the -/// comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomile_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomile((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand is -/// greater than the second operand and returns the result of the -/// comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomigt_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomigt((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine if the first operand is -/// greater than or equal to the second operand and returns the result of -/// the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 0 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 0 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomige_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomige((__v4sf)__a, (__v4sf)__b); -} - -/// Performs an unordered comparison of two 32-bit float values using -/// the low-order bits of both operands to determine inequality and returns -/// the result of the comparison. -/// -/// If either of the two lower 32-bit values is NaN, 1 is returned. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUCOMISS / UCOMISS instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \param __b -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the comparison. -/// \returns An integer containing the comparison results. If either of the two -/// lower 32-bit values is NaN, 1 is returned. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_ucomineq_ss(__m128 __a, __m128 __b) -{ - return __builtin_ia32_ucomineq((__v4sf)__a, (__v4sf)__b); -} - -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 32-bit integer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 32-bit integer containing the converted value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvtss_si32(__m128 __a) -{ - return __builtin_ia32_cvtss2si((__v4sf)__a); -} - -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 32-bit integer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 32-bit integer containing the converted value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvt_ss2si(__m128 __a) -{ - return _mm_cvtss_si32(__a); -} - -#ifdef __x86_64__ - -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 64-bit integer. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSS2SI / CVTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. 
The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 64-bit integer containing the converted value. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvtss_si64(__m128 __a) -{ - return __builtin_ia32_cvtss2si64((__v4sf)__a); -} - -#endif - -/// Converts two low-order float values in a 128-bit vector of -/// [4 x float] into a 64-bit vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPS2PI instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvtps_pi32(__m128 __a) -{ - return (__m64)__builtin_ia32_cvtps2pi((__v4sf)__a); -} - -/// Converts two low-order float values in a 128-bit vector of -/// [4 x float] into a 64-bit vector of [2 x i32]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPS2PI instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvt_ps2pi(__m128 __a) -{ - return _mm_cvtps_pi32(__a); -} - -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 32-bit integer, truncating the result when it is -/// inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 32-bit integer containing the converted value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvttss_si32(__m128 __a) -{ - return __builtin_ia32_cvttss2si((__v4sf)__a); -} - -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 32-bit integer, truncating the result when it is -/// inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 32-bit integer containing the converted value. -static __inline__ int __DEFAULT_FN_ATTRS -_mm_cvtt_ss2si(__m128 __a) -{ - return _mm_cvttss_si32(__a); -} - -#ifdef __x86_64__ -/// Converts a float value contained in the lower 32 bits of a vector of -/// [4 x float] into a 64-bit integer, truncating the result when it is -/// inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTTSS2SI / CVTTSS2SI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the conversion. -/// \returns A 64-bit integer containing the converted value. -static __inline__ long long __DEFAULT_FN_ATTRS -_mm_cvttss_si64(__m128 __a) -{ - return __builtin_ia32_cvttss2si64((__v4sf)__a); -} -#endif - -/// Converts two low-order float values in a 128-bit vector of -/// [4 x float] into a 64-bit vector of [2 x i32], truncating the result -/// when it is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTTPS2PI / VTTPS2PI -/// instructions. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 64-bit integer vector containing the converted values. 
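[Editor's note] The difference between the CVTSS2SI and CVTTSS2SI families is the rounding step: the former uses the current MXCSR rounding mode (round-to-nearest-even by default), the latter always truncates toward zero. A small illustration; the helper name low_lane_to_int is hypothetical:

    #include <xmmintrin.h>

    /* With default rounding, 2.7f becomes 3 via _mm_cvtss_si32
     * and 2 via _mm_cvttss_si32. */
    static inline void low_lane_to_int(__m128 v, int *rounded, int *truncated)
    {
        *rounded   = _mm_cvtss_si32(v);
        *truncated = _mm_cvttss_si32(v);
    }
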
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvttps_pi32(__m128 __a) -{ - return (__m64)__builtin_ia32_cvttps2pi((__v4sf)__a); -} - -/// Converts two low-order float values in a 128-bit vector of [4 x -/// float] into a 64-bit vector of [2 x i32], truncating the result when it -/// is inexact. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTTPS2PI instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \returns A 64-bit integer vector containing the converted values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvtt_ps2pi(__m128 __a) -{ - return _mm_cvttps_pi32(__a); -} - -/// Converts a 32-bit signed integer value into a floating point value -/// and writes it to the lower 32 bits of the destination. The remaining -/// higher order elements of the destination vector are copied from the -/// corresponding elements in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 32-bit signed integer operand containing the value to be converted. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// converted value of the second operand. The upper 96 bits are copied from -/// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi32_ss(__m128 __a, int __b) -{ - __a[0] = __b; - return __a; -} - -/// Converts a 32-bit signed integer value into a floating point value -/// and writes it to the lower 32 bits of the destination. The remaining -/// higher order elements of the destination are copied from the -/// corresponding elements in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 32-bit signed integer operand containing the value to be converted. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// converted value of the second operand. The upper 96 bits are copied from -/// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvt_si2ss(__m128 __a, int __b) -{ - return _mm_cvtsi32_ss(__a, __b); -} - -#ifdef __x86_64__ - -/// Converts a 64-bit signed integer value into a floating point value -/// and writes it to the lower 32 bits of the destination. The remaining -/// higher order elements of the destination are copied from the -/// corresponding elements in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VCVTSI2SS / CVTSI2SS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 64-bit signed integer operand containing the value to be converted. -/// \returns A 128-bit vector of [4 x float] whose lower 32 bits contain the -/// converted value of the second operand. The upper 96 bits are copied from -/// the upper 96 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_cvtsi64_ss(__m128 __a, long long __b) -{ - __a[0] = __b; - return __a; -} - -#endif - -/// Converts two elements of a 64-bit vector of [2 x i32] into two -/// floating point values and writes them to the lower 64-bits of the -/// destination. The remaining higher order elements of the destination are -/// copied from the corresponding elements in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS instruction. 
-/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 64-bit vector of [2 x i32]. The elements in this vector are converted -/// and written to the corresponding low-order elements in the destination. -/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the -/// converted value of the second operand. The upper 64 bits are copied from -/// the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpi32_ps(__m128 __a, __m64 __b) -{ - return __builtin_ia32_cvtpi2ps((__v4sf)__a, (__v2si)__b); -} - -/// Converts two elements of a 64-bit vector of [2 x i32] into two -/// floating point values and writes them to the lower 64-bits of the -/// destination. The remaining higher order elements of the destination are -/// copied from the corresponding elements in the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. -/// \param __b -/// A 64-bit vector of [2 x i32]. The elements in this vector are converted -/// and written to the corresponding low-order elements in the destination. -/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the -/// converted value from the second operand. The upper 64 bits are copied -/// from the upper 64 bits of the first operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvt_pi2ps(__m128 __a, __m64 __b) -{ - return _mm_cvtpi32_ps(__a, __b); -} - -/// Extracts a float value contained in the lower 32 bits of a vector of -/// [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. The lower 32 bits of this operand are -/// used in the extraction. -/// \returns A 32-bit float containing the extracted value. -static __inline__ float __DEFAULT_FN_ATTRS -_mm_cvtss_f32(__m128 __a) -{ - return __a[0]; -} - -/// Loads two packed float values from the address \a __p into the -/// high-order bits of a 128-bit vector of [4 x float]. The low-order bits -/// are copied from the low-order bits of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVHPD / MOVHPD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. Bits [63:0] are written to bits [63:0] -/// of the destination. -/// \param __p -/// A pointer to two packed float values. Bits [63:0] are written to bits -/// [127:64] of the destination. -/// \returns A 128-bit vector of [4 x float] containing the moved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_loadh_pi(__m128 __a, const __m64 *__p) -{ - typedef float __mm_loadh_pi_v2f32 __attribute__((__vector_size__(8))); - struct __mm_loadh_pi_struct { - __mm_loadh_pi_v2f32 __u; - } __attribute__((__packed__, __may_alias__)); - __mm_loadh_pi_v2f32 __b = ((const struct __mm_loadh_pi_struct*)__p)->__u; - __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); - return __builtin_shufflevector(__a, __bb, 0, 1, 4, 5); -} - -/// Loads two packed float values from the address \a __p into the -/// low-order bits of a 128-bit vector of [4 x float]. The high-order bits -/// are copied from the high-order bits of the first operand. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVLPD / MOVLPD instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. Bits [127:64] are written to bits -/// [127:64] of the destination. 
-/// \param __p -/// A pointer to two packed float values. Bits [63:0] are written to bits -/// [63:0] of the destination. -/// \returns A 128-bit vector of [4 x float] containing the moved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_loadl_pi(__m128 __a, const __m64 *__p) -{ - typedef float __mm_loadl_pi_v2f32 __attribute__((__vector_size__(8))); - struct __mm_loadl_pi_struct { - __mm_loadl_pi_v2f32 __u; - } __attribute__((__packed__, __may_alias__)); - __mm_loadl_pi_v2f32 __b = ((const struct __mm_loadl_pi_struct*)__p)->__u; - __m128 __bb = __builtin_shufflevector(__b, __b, 0, 1, 0, 1); - return __builtin_shufflevector(__a, __bb, 4, 5, 2, 3); -} - -/// Constructs a 128-bit floating-point vector of [4 x float]. The lower -/// 32 bits of the vector are initialized with the single-precision -/// floating-point value loaded from a specified memory location. The upper -/// 96 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. -/// -/// \param __p -/// A pointer to a 32-bit memory location containing a single-precision -/// floating-point value. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. The -/// lower 32 bits contain the value loaded from the memory location. The -/// upper 96 bits are set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_load_ss(const float *__p) -{ - struct __mm_load_ss_struct { - float __u; - } __attribute__((__packed__, __may_alias__)); - float __u = ((const struct __mm_load_ss_struct*)__p)->__u; - return __extension__ (__m128){ __u, 0, 0, 0 }; -} - -/// Loads a 32-bit float value and duplicates it to all four vector -/// elements of a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBROADCASTSS / MOVSS + shuffling -/// instruction. -/// -/// \param __p -/// A pointer to a float value to be loaded and duplicated. -/// \returns A 128-bit vector of [4 x float] containing the loaded and -/// duplicated values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_load1_ps(const float *__p) -{ - struct __mm_load1_ps_struct { - float __u; - } __attribute__((__packed__, __may_alias__)); - float __u = ((const struct __mm_load1_ps_struct*)__p)->__u; - return __extension__ (__m128){ __u, __u, __u, __u }; -} - -#define _mm_load_ps1(p) _mm_load1_ps(p) - -/// Loads a 128-bit floating-point vector of [4 x float] from an aligned -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 128-bit aligned. -/// \returns A 128-bit vector of [4 x float] containing the loaded values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_load_ps(const float *__p) -{ - return *(const __m128*)__p; -} - -/// Loads a 128-bit floating-point vector of [4 x float] from an -/// unaligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \returns A 128-bit vector of [4 x float] containing the loaded values. 
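[Editor's note] A typical pattern built from the load/store intrinsics in this file is a loop over a float buffer that may not be 16-byte aligned, hence the unaligned forms. An editor's sketch, assuming the hypothetical helper name sum_f32; it also uses _mm_add_ps, _mm_setzero_ps and _mm_storeu_ps from elsewhere in this header:

    #include <xmmintrin.h>

    /* Sum n floats, four lanes at a time, with a scalar tail loop. */
    static inline float sum_f32(const float *p, unsigned n)
    {
        __m128 acc = _mm_setzero_ps();
        unsigned i = 0;
        for (; i + 4 <= n; i += 4)
            acc = _mm_add_ps(acc, _mm_loadu_ps(p + i));
        float tmp[4];
        _mm_storeu_ps(tmp, acc);                  /* spill the accumulator */
        float s = tmp[0] + tmp[1] + tmp[2] + tmp[3];
        for (; i < n; ++i)
            s += p[i];
        return s;
    }
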
-static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_loadu_ps(const float *__p) -{ - struct __loadu_ps { - __m128_u __v; - } __attribute__((__packed__, __may_alias__)); - return ((const struct __loadu_ps*)__p)->__v; -} - -/// Loads four packed float values, in reverse order, from an aligned -/// memory location to 32-bit elements in a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS / MOVAPS + shuffling -/// instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 128-bit aligned. -/// \returns A 128-bit vector of [4 x float] containing the moved values, loaded -/// in reverse order. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_loadr_ps(const float *__p) -{ - __m128 __a = _mm_load_ps(__p); - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); -} - -/// Create a 128-bit vector of [4 x float] with undefined values. -/// -/// \headerfile -/// -/// This intrinsic has no corresponding instruction. -/// -/// \returns A 128-bit vector of [4 x float] containing undefined values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_undefined_ps(void) -{ - return (__m128)__builtin_ia32_undef128(); -} - -/// Constructs a 128-bit floating-point vector of [4 x float]. The lower -/// 32 bits of the vector are initialized with the specified single-precision -/// floating-point value. The upper 96 bits are set to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. -/// -/// \param __w -/// A single-precision floating-point value used to initialize the lower 32 -/// bits of the result. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. The -/// lower 32 bits contain the value provided in the source operand. The -/// upper 96 bits are set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ss(float __w) -{ - return __extension__ (__m128){ __w, 0, 0, 0 }; -} - -/// Constructs a 128-bit floating-point vector of [4 x float], with each -/// of the four single-precision floating-point vector elements set to the -/// specified single-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS / PERMILPS instruction. -/// -/// \param __w -/// A single-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set1_ps(float __w) -{ - return __extension__ (__m128){ __w, __w, __w, __w }; -} - -/* Microsoft specific. */ -/// Constructs a 128-bit floating-point vector of [4 x float], with each -/// of the four single-precision floating-point vector elements set to the -/// specified single-precision floating-point value. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPERMILPS / PERMILPS instruction. -/// -/// \param __w -/// A single-precision floating-point value used to initialize each vector -/// element of the result. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ps1(float __w) -{ - return _mm_set1_ps(__w); -} - -/// Constructs a 128-bit floating-point vector of [4 x float] -/// initialized with the specified single-precision floating-point values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. 
-/// -/// \param __z -/// A single-precision floating-point value used to initialize bits [127:96] -/// of the result. -/// \param __y -/// A single-precision floating-point value used to initialize bits [95:64] -/// of the result. -/// \param __x -/// A single-precision floating-point value used to initialize bits [63:32] -/// of the result. -/// \param __w -/// A single-precision floating-point value used to initialize bits [31:0] -/// of the result. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_set_ps(float __z, float __y, float __x, float __w) -{ - return __extension__ (__m128){ __w, __x, __y, __z }; -} - -/// Constructs a 128-bit floating-point vector of [4 x float], -/// initialized in reverse order with the specified 32-bit single-precision -/// float-point values. -/// -/// \headerfile -/// -/// This intrinsic is a utility function and does not correspond to a specific -/// instruction. -/// -/// \param __z -/// A single-precision floating-point value used to initialize bits [31:0] -/// of the result. -/// \param __y -/// A single-precision floating-point value used to initialize bits [63:32] -/// of the result. -/// \param __x -/// A single-precision floating-point value used to initialize bits [95:64] -/// of the result. -/// \param __w -/// A single-precision floating-point value used to initialize bits [127:96] -/// of the result. -/// \returns An initialized 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_setr_ps(float __z, float __y, float __x, float __w) -{ - return __extension__ (__m128){ __z, __y, __x, __w }; -} - -/// Constructs a 128-bit floating-point vector of [4 x float] initialized -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VXORPS / XORPS instruction. -/// -/// \returns An initialized 128-bit floating-point vector of [4 x float] with -/// all elements set to zero. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_setzero_ps(void) -{ - return __extension__ (__m128){ 0, 0, 0, 0 }; -} - -/// Stores the upper 64 bits of a 128-bit vector of [4 x float] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VPEXTRQ / PEXTRQ instruction. -/// -/// \param __p -/// A pointer to a 64-bit memory location. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeh_pi(__m64 *__p, __m128 __a) -{ - typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); - struct __mm_storeh_pi_struct { - __mm_storeh_pi_v2f32 __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 2, 3); -} - -/// Stores the lower 64 bits of a 128-bit vector of [4 x float] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVLPS / MOVLPS instruction. -/// -/// \param __p -/// A pointer to a memory location that will receive the float values. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. 
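[Editor's note] The argument order is the only difference between _mm_set_ps and _mm_setr_ps: _mm_set_ps lists elements from the highest down, _mm_setr_ps lists them in memory order. Both calls in the sketch below build the same vector {1, 2, 3, 4} with element 0 first; make_1234 is an illustrative name:

    #include <xmmintrin.h>

    static inline __m128 make_1234(int reversed)
    {
        return reversed ? _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f)
                        : _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    }
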
-static __inline__ void __DEFAULT_FN_ATTRS -_mm_storel_pi(__m64 *__p, __m128 __a) -{ - typedef float __mm_storeh_pi_v2f32 __attribute__((__vector_size__(8))); - struct __mm_storeh_pi_struct { - __mm_storeh_pi_v2f32 __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_storeh_pi_struct*)__p)->__u = __builtin_shufflevector(__a, __a, 0, 1); -} - -/// Stores the lower 32 bits of a 128-bit vector of [4 x float] to a -/// memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVSS / MOVSS instruction. -/// -/// \param __p -/// A pointer to a 32-bit memory location. -/// \param __a -/// A 128-bit vector of [4 x float] containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_ss(float *__p, __m128 __a) -{ - struct __mm_store_ss_struct { - float __u; - } __attribute__((__packed__, __may_alias__)); - ((struct __mm_store_ss_struct*)__p)->__u = __a[0]; -} - -/// Stores a 128-bit vector of [4 x float] to an unaligned memory -/// location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVUPS / MOVUPS instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location does not have to be aligned. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storeu_ps(float *__p, __m128 __a) -{ - struct __storeu_ps { - __m128_u __v; - } __attribute__((__packed__, __may_alias__)); - ((struct __storeu_ps*)__p)->__v = __a; -} - -/// Stores a 128-bit vector of [4 x float] into an aligned memory -/// location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS / MOVAPS instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 16-byte aligned. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_ps(float *__p, __m128 __a) -{ - *(__m128*)__p = __a; -} - -/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into -/// four contiguous elements in an aligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to VMOVAPS / MOVAPS + shuffling -/// instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. -/// \param __a -/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each -/// of the four contiguous elements pointed by \a __p. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store1_ps(float *__p, __m128 __a) -{ - __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 0, 0, 0, 0); - _mm_store_ps(__p, __a); -} - -/// Stores the lower 32 bits of a 128-bit vector of [4 x float] into -/// four contiguous elements in an aligned memory location. -/// -/// \headerfile -/// -/// This intrinsic corresponds to VMOVAPS / MOVAPS + shuffling -/// instruction. -/// -/// \param __p -/// A pointer to a 128-bit memory location. -/// \param __a -/// A 128-bit vector of [4 x float] whose lower 32 bits are stored to each -/// of the four contiguous elements pointed by \a __p. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_store_ps1(float *__p, __m128 __a) -{ - _mm_store1_ps(__p, __a); -} - -/// Stores float values from a 128-bit vector of [4 x float] to an -/// aligned memory location in reverse order. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVAPS / MOVAPS + shuffling -/// instruction. 
-/// -/// \param __p -/// A pointer to a 128-bit memory location. The address of the memory -/// location has to be 128-bit aligned. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be stored. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_storer_ps(float *__p, __m128 __a) -{ - __a = __builtin_shufflevector((__v4sf)__a, (__v4sf)__a, 3, 2, 1, 0); - _mm_store_ps(__p, __a); -} - -#define _MM_HINT_ET0 7 -#define _MM_HINT_ET1 6 -#define _MM_HINT_T0 3 -#define _MM_HINT_T1 2 -#define _MM_HINT_T2 1 -#define _MM_HINT_NTA 0 - -#ifndef _MSC_VER -/* FIXME: We have to #define this because "sel" must be a constant integer, and - Sema doesn't do any form of constant propagation yet. */ - -/// Loads one cache line of data from the specified address to a location -/// closer to the processor. -/// -/// \headerfile -/// -/// \code -/// void _mm_prefetch(const void * a, const int sel); -/// \endcode -/// -/// This intrinsic corresponds to the PREFETCHNTA instruction. -/// -/// \param a -/// A pointer to a memory location containing a cache line of data. -/// \param sel -/// A predefined integer constant specifying the type of prefetch -/// operation: \n -/// _MM_HINT_NTA: Move data using the non-temporal access (NTA) hint. The -/// PREFETCHNTA instruction will be generated. \n -/// _MM_HINT_T0: Move data using the T0 hint. The PREFETCHT0 instruction will -/// be generated. \n -/// _MM_HINT_T1: Move data using the T1 hint. The PREFETCHT1 instruction will -/// be generated. \n -/// _MM_HINT_T2: Move data using the T2 hint. The PREFETCHT2 instruction will -/// be generated. -#define _mm_prefetch(a, sel) (__builtin_prefetch((const void *)(a), \ - ((sel) >> 2) & 1, (sel) & 0x3)) -#endif - -/// Stores a 64-bit integer in the specified aligned memory location. To -/// minimize caching, the data is flagged as non-temporal (unlikely to be -/// used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MOVNTQ instruction. -/// -/// \param __p -/// A pointer to an aligned memory location used to store the register value. -/// \param __a -/// A 64-bit integer containing the value to be stored. -static __inline__ void __DEFAULT_FN_ATTRS_MMX -_mm_stream_pi(__m64 *__p, __m64 __a) -{ - __builtin_ia32_movntq(__p, __a); -} - -/// Moves packed float values from a 128-bit vector of [4 x float] to a -/// 128-bit aligned memory location. To minimize caching, the data is flagged -/// as non-temporal (unlikely to be used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVNTPS / MOVNTPS instruction. -/// -/// \param __p -/// A pointer to a 128-bit aligned memory location that will receive the -/// single-precision floating-point values. -/// \param __a -/// A 128-bit vector of [4 x float] containing the values to be moved. -static __inline__ void __DEFAULT_FN_ATTRS -_mm_stream_ps(float *__p, __m128 __a) -{ - __builtin_nontemporal_store((__v4sf)__a, (__v4sf*)__p); -} - -#if defined(__cplusplus) -extern "C" { -#endif - -/// Forces strong memory ordering (serialization) between store -/// instructions preceding this instruction and store instructions following -/// this instruction, ensuring the system completes all previous stores -/// before executing subsequent stores. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the SFENCE instruction. 
-/// -void _mm_sfence(void); - -#if defined(__cplusplus) -} // extern "C" -#endif - -/// Extracts a 16-bit element from a 64-bit vector of [4 x i16] and -/// returns it, as specified by the immediate integer operand. -/// -/// \headerfile -/// -/// \code -/// int _mm_extract_pi16(__m64 a, int n); -/// \endcode -/// -/// This intrinsic corresponds to the VPEXTRW / PEXTRW instruction. -/// -/// \param a -/// A 64-bit vector of [4 x i16]. -/// \param n -/// An immediate integer operand that determines which bits are extracted: \n -/// 0: Bits [15:0] are copied to the destination. \n -/// 1: Bits [31:16] are copied to the destination. \n -/// 2: Bits [47:32] are copied to the destination. \n -/// 3: Bits [63:48] are copied to the destination. -/// \returns A 16-bit integer containing the extracted 16 bits of packed data. -#define _mm_extract_pi16(a, n) \ - ((int)__builtin_ia32_vec_ext_v4hi((__v4hi)a, (int)n)) - -/// Copies data from the 64-bit vector of [4 x i16] to the destination, -/// and inserts the lower 16 bits of an integer operand at the 16-bit offset -/// specified by the immediate operand \a n. -/// -/// \headerfile -/// -/// \code -/// __m64 _mm_insert_pi16(__m64 a, int d, int n); -/// \endcode -/// -/// This intrinsic corresponds to the PINSRW instruction. -/// -/// \param a -/// A 64-bit vector of [4 x i16]. -/// \param d -/// An integer. The lower 16-bit value from this operand is written to the -/// destination at the offset specified by operand \a n. -/// \param n -/// An immediate integer operand that determines which bits in the destination -/// are used. \n -/// 0: Bits [15:0] are copied to the destination. \n -/// 1: Bits [31:16] are copied to the destination. \n -/// 2: Bits [47:32] are copied to the destination. \n -/// 3: Bits [63:48] are copied to the destination. \n -/// The remaining bits in the destination are copied from the corresponding -/// bits in operand \a a. -/// \returns A 64-bit integer vector containing the copied packed data from the -/// operands. -#define _mm_insert_pi16(a, d, n) \ - ((__m64)__builtin_ia32_vec_set_v4hi((__v4hi)a, (int)d, (int)n)) - -/// Compares each of the corresponding packed 16-bit integer values of -/// the 64-bit integer vectors, and writes the greater value to the -/// corresponding bits in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMAXSW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_max_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pmaxsw((__v4hi)__a, (__v4hi)__b); -} - -/// Compares each of the corresponding packed 8-bit unsigned integer -/// values of the 64-bit integer vectors, and writes the greater value to the -/// corresponding bits in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMAXUB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the comparison results.
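As a rough illustration of the _mm_prefetch macro and the _MM_HINT_* constants defined above (not part of this patch; the prefetch distance of 16 elements is an arbitrary assumption), a streaming loop might prefetch ahead like this:

#include <stddef.h>
#include <xmmintrin.h>

float sum_with_prefetch(const float *data, size_t n)
{
    float acc = 0.0f;
    for (size_t i = 0; i < n; ++i) {
        /* Hint the data a few iterations ahead into all cache levels
         * (_MM_HINT_T0 == 3). */
        if (i + 16 < n)
            _mm_prefetch((const char *)&data[i + 16], _MM_HINT_T0);
        acc += data[i];
    }
    return acc;
}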
-static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_max_pu8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pmaxub((__v8qi)__a, (__v8qi)__b); -} - -/// Compares each of the corresponding packed 16-bit integer values of -/// the 64-bit integer vectors, and writes the lesser value to the -/// corresponding bits in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMINSW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_min_pi16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pminsw((__v4hi)__a, (__v4hi)__b); -} - -/// Compares each of the corresponding packed 8-bit unsigned integer -/// values of the 64-bit integer vectors, and writes the lesser value to the -/// corresponding bits in the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMINUB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the comparison results. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_min_pu8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pminub((__v8qi)__a, (__v8qi)__b); -} - -/// Takes the most significant bit from each 8-bit element in a 64-bit -/// integer vector to create an 8-bit mask value. Zero-extends the value to -/// 32-bit integer and writes it to the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMOVMSKB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing the values with bits to be extracted. -/// \returns The most significant bit from each 8-bit element in \a __a, -/// written to bits [7:0]. -static __inline__ int __DEFAULT_FN_ATTRS_MMX -_mm_movemask_pi8(__m64 __a) -{ - return __builtin_ia32_pmovmskb((__v8qi)__a); -} - -/// Multiplies packed 16-bit unsigned integer values and writes the -/// high-order 16 bits of each 32-bit product to the corresponding bits in -/// the destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PMULHUW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the products of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_mulhi_pu16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pmulhuw((__v4hi)__a, (__v4hi)__b); -} - -/// Shuffles the 4 16-bit integers from a 64-bit integer vector to the -/// destination, as specified by the immediate value operand. -/// -/// \headerfile -/// -/// \code -/// __m64 _mm_shuffle_pi16(__m64 a, const int n); -/// \endcode -/// -/// This intrinsic corresponds to the PSHUFW instruction. -/// -/// \param a -/// A 64-bit integer vector containing the values to be shuffled. -/// \param n -/// An immediate value containing an 8-bit value specifying which elements to -/// copy from \a a. The destinations within the 64-bit destination are -/// assigned values as follows: \n -/// Bits [1:0] are used to assign values to bits [15:0] in the -/// destination. \n -/// Bits [3:2] are used to assign values to bits [31:16] in the -/// destination. 
\n -/// Bits [5:4] are used to assign values to bits [47:32] in the -/// destination. \n -/// Bits [7:6] are used to assign values to bits [63:48] in the -/// destination. \n -/// Bit value assignments: \n -/// 00: assigned from bits [15:0] of \a a. \n -/// 01: assigned from bits [31:16] of \a a. \n -/// 10: assigned from bits [47:32] of \a a. \n -/// 11: assigned from bits [63:48] of \a a. -/// \returns A 64-bit integer vector containing the shuffled values. -#define _mm_shuffle_pi16(a, n) \ - ((__m64)__builtin_ia32_pshufw((__v4hi)(__m64)(a), (n))) - -/// Conditionally copies the values from each 8-bit element in the first -/// 64-bit integer vector operand to the specified memory location, as -/// specified by the most significant bit in the corresponding element in the -/// second 64-bit integer vector operand. -/// -/// To minimize caching, the data is flagged as non-temporal -/// (unlikely to be used again soon). -/// -/// \headerfile -/// -/// This intrinsic corresponds to the MASKMOVQ instruction. -/// -/// \param __d -/// A 64-bit integer vector containing the values with elements to be copied. -/// \param __n -/// A 64-bit integer vector operand. The most significant bit from each 8-bit -/// element determines whether the corresponding element in operand \a __d -/// is copied. If the most significant bit of a given element is 1, the -/// corresponding element in operand \a __d is copied. -/// \param __p -/// A pointer to a 64-bit memory location that will receive the conditionally -/// copied integer values. The address of the memory location does not have -/// to be aligned. -static __inline__ void __DEFAULT_FN_ATTRS_MMX -_mm_maskmove_si64(__m64 __d, __m64 __n, char *__p) -{ - __builtin_ia32_maskmovq((__v8qi)__d, (__v8qi)__n, __p); -} - -/// Computes the rounded averages of the packed unsigned 8-bit integer -/// values and writes the averages to the corresponding bits in the -/// destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PAVGB instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_avg_pu8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pavgb((__v8qi)__a, (__v8qi)__b); -} - -/// Computes the rounded averages of the packed unsigned 16-bit integer -/// values and writes the averages to the corresponding bits in the -/// destination. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PAVGW instruction. -/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector containing the averages of both operands. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_avg_pu16(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_pavgw((__v4hi)__a, (__v4hi)__b); -} - -/// Subtracts the corresponding 8-bit unsigned integer values of the two -/// 64-bit vector operands and computes the absolute value for each of the -/// difference. Then sum of the 8 absolute differences is written to the -/// bits [15:0] of the destination; the remaining bits [63:16] are cleared. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the PSADBW instruction. 
-/// -/// \param __a -/// A 64-bit integer vector containing one of the source operands. -/// \param __b -/// A 64-bit integer vector containing one of the source operands. -/// \returns A 64-bit integer vector whose lower 16 bits contain the sums of the -/// sets of absolute differences between both operands. The upper bits are -/// cleared. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_sad_pu8(__m64 __a, __m64 __b) -{ - return (__m64)__builtin_ia32_psadbw((__v8qi)__a, (__v8qi)__b); -} - -#if defined(__cplusplus) -extern "C" { -#endif - -/// Returns the contents of the MXCSR register as a 32-bit unsigned -/// integer value. -/// -/// There are several groups of macros associated with this -/// intrinsic, including: -///
-/// For checking exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, -/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, -/// _MM_EXCEPT_INEXACT. There is a convenience wrapper -/// _MM_GET_EXCEPTION_STATE(). -///
-/// For checking exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, -/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. -/// There is a convenience wrapper _MM_GET_EXCEPTION_MASK(). -///
-/// For checking rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, -/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper -/// _MM_GET_ROUNDING_MODE(). -///
-/// For checking flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. -/// There is a convenience wrapper _MM_GET_FLUSH_ZERO_MODE(). -///
-/// For checking denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, -/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper -/// _MM_GET_DENORMALS_ZERO_MODE(). -///
-/// -/// For example, the following expression checks if an overflow exception has -/// occurred: -/// \code -/// ( _mm_getcsr() & _MM_EXCEPT_OVERFLOW ) -/// \endcode -/// -/// The following expression gets the current rounding mode: -/// \code -/// _MM_GET_ROUNDING_MODE() -/// \endcode -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VSTMXCSR / STMXCSR instruction. -/// -/// \returns A 32-bit unsigned integer containing the contents of the MXCSR -/// register. -unsigned int _mm_getcsr(void); - -/// Sets the MXCSR register with the 32-bit unsigned integer value. -/// -/// There are several groups of macros associated with this intrinsic, -/// including: -///
-/// For setting exception states: _MM_EXCEPT_INVALID, _MM_EXCEPT_DIV_ZERO, -/// _MM_EXCEPT_DENORM, _MM_EXCEPT_OVERFLOW, _MM_EXCEPT_UNDERFLOW, -/// _MM_EXCEPT_INEXACT. There is a convenience wrapper -/// _MM_SET_EXCEPTION_STATE(x) where x is one of these macros. -///
-/// For setting exception masks: _MM_MASK_UNDERFLOW, _MM_MASK_OVERFLOW, -/// _MM_MASK_INVALID, _MM_MASK_DENORM, _MM_MASK_DIV_ZERO, _MM_MASK_INEXACT. -/// There is a convenience wrapper _MM_SET_EXCEPTION_MASK(x) where x is one -/// of these macros. -///
-/// For setting rounding modes: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, -/// _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO. There is a convenience wrapper -/// _MM_SET_ROUNDING_MODE(x) where x is one of these macros. -///
-/// For setting flush-to-zero mode: _MM_FLUSH_ZERO_ON, _MM_FLUSH_ZERO_OFF. -/// There is a convenience wrapper _MM_SET_FLUSH_ZERO_MODE(x) where x is -/// one of these macros. -///
-/// For setting denormals-are-zero mode: _MM_DENORMALS_ZERO_ON, -/// _MM_DENORMALS_ZERO_OFF. There is a convenience wrapper -/// _MM_SET_DENORMALS_ZERO_MODE(x) where x is one of these macros. -///
-/// -/// For example, the following expression causes subsequent floating-point -/// operations to round up: -/// _mm_setcsr(_mm_getcsr() | _MM_ROUND_UP) -/// -/// The following example sets the DAZ and FTZ flags: -/// \code -/// void setFlags() { -/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); -/// _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); -/// } -/// \endcode -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VLDMXCSR / LDMXCSR instruction. -/// -/// \param __i -/// A 32-bit unsigned integer value to be written to the MXCSR register. -void _mm_setcsr(unsigned int __i); - -#if defined(__cplusplus) -} // extern "C" -#endif - -/// Selects 4 float values from the 128-bit operands of [4 x float], as -/// specified by the immediate value operand. -/// -/// \headerfile -/// -/// \code -/// __m128 _mm_shuffle_ps(__m128 a, __m128 b, const int mask); -/// \endcode -/// -/// This intrinsic corresponds to the VSHUFPS / SHUFPS instruction. -/// -/// \param a -/// A 128-bit vector of [4 x float]. -/// \param b -/// A 128-bit vector of [4 x float]. -/// \param mask -/// An immediate value containing an 8-bit value specifying which elements to -/// copy from \a a and \a b. \n -/// Bits [3:0] specify the values copied from operand \a a. \n -/// Bits [7:4] specify the values copied from operand \a b. \n -/// The destinations within the 128-bit destination are assigned values as -/// follows: \n -/// Bits [1:0] are used to assign values to bits [31:0] in the -/// destination. \n -/// Bits [3:2] are used to assign values to bits [63:32] in the -/// destination. \n -/// Bits [5:4] are used to assign values to bits [95:64] in the -/// destination. \n -/// Bits [7:6] are used to assign values to bits [127:96] in the -/// destination. \n -/// Bit value assignments: \n -/// 00: Bits [31:0] copied from the specified operand. \n -/// 01: Bits [63:32] copied from the specified operand. \n -/// 10: Bits [95:64] copied from the specified operand. \n -/// 11: Bits [127:96] copied from the specified operand. -/// \returns A 128-bit vector of [4 x float] containing the shuffled values. -#define _mm_shuffle_ps(a, b, mask) \ - ((__m128)__builtin_ia32_shufps((__v4sf)(__m128)(a), (__v4sf)(__m128)(b), \ - (int)(mask))) - -/// Unpacks the high-order (index 2,3) values from two 128-bit vectors of -/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKHPS / UNPCKHPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. \n -/// Bits [95:64] are written to bits [31:0] of the destination. \n -/// Bits [127:96] are written to bits [95:64] of the destination. -/// \param __b -/// A 128-bit vector of [4 x float]. -/// Bits [95:64] are written to bits [63:32] of the destination. \n -/// Bits [127:96] are written to bits [127:96] of the destination. -/// \returns A 128-bit vector of [4 x float] containing the interleaved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_unpackhi_ps(__m128 __a, __m128 __b) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 2, 6, 3, 7); -} - -/// Unpacks the low-order (index 0,1) values from two 128-bit vectors of -/// [4 x float] and interleaves them into a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPS / UNPCKLPS instruction. -/// -/// \param __a -/// A 128-bit vector of [4 x float]. \n -/// Bits [31:0] are written to bits [31:0] of the destination. 
\n -/// Bits [63:32] are written to bits [95:64] of the destination. -/// \param __b -/// A 128-bit vector of [4 x float]. \n -/// Bits [31:0] are written to bits [63:32] of the destination. \n -/// Bits [63:32] are written to bits [127:96] of the destination. -/// \returns A 128-bit vector of [4 x float] containing the interleaved values. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_unpacklo_ps(__m128 __a, __m128 __b) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 4, 1, 5); -} - -/// Constructs a 128-bit floating-point vector of [4 x float]. The lower -/// 32 bits are set to the lower 32 bits of the second parameter. The upper -/// 96 bits are set to the upper 96 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VBLENDPS / BLENDPS / MOVSS -/// instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. The upper 96 bits are -/// written to the upper 96 bits of the result. -/// \param __b -/// A 128-bit floating-point vector of [4 x float]. The lower 32 bits are -/// written to the lower 32 bits of the result. -/// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_move_ss(__m128 __a, __m128 __b) -{ - __a[0] = __b[0]; - return __a; -} - -/// Constructs a 128-bit floating-point vector of [4 x float]. The lower -/// 64 bits are set to the upper 64 bits of the second parameter. The upper -/// 64 bits are set to the upper 64 bits of the first parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKHPD / UNPCKHPD instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are -/// written to the upper 64 bits of the result. -/// \param __b -/// A 128-bit floating-point vector of [4 x float]. The upper 64 bits are -/// written to the lower 64 bits of the result. -/// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_movehl_ps(__m128 __a, __m128 __b) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 6, 7, 2, 3); -} - -/// Constructs a 128-bit floating-point vector of [4 x float]. The lower -/// 64 bits are set to the lower 64 bits of the first parameter. The upper -/// 64 bits are set to the lower 64 bits of the second parameter. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VUNPCKLPD / UNPCKLPD instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are -/// written to the lower 64 bits of the result. -/// \param __b -/// A 128-bit floating-point vector of [4 x float]. The lower 64 bits are -/// written to the upper 64 bits of the result. -/// \returns A 128-bit floating-point vector of [4 x float]. -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_movelh_ps(__m128 __a, __m128 __b) -{ - return __builtin_shufflevector((__v4sf)__a, (__v4sf)__b, 0, 1, 4, 5); -} - -/// Converts a 64-bit vector of [4 x i16] into a 128-bit vector of [4 x -/// float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. -/// -/// \param __a -/// A 64-bit vector of [4 x i16]. The elements of the destination are copied -/// from the corresponding elements in this operand. -/// \returns A 128-bit vector of [4 x float] containing the copied and converted -/// values from the operand. 
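Since _mm_getcsr/_mm_setcsr and the wrapper macros documented above act on global floating-point control state, a common pattern is to save and restore MXCSR around a region that temporarily changes the rounding mode. The sketch below is illustrative only (not part of this patch) and assumes the _MM_SET_ROUNDING_MODE/_MM_ROUND_DOWN macros defined later in this header:

#include <xmmintrin.h>

/* Run fn() with round-toward-negative-infinity, then restore the caller's
 * exception, mask, rounding, and flush-to-zero state in one write. */
void with_round_down(void (*fn)(void))
{
    unsigned int saved = _mm_getcsr();
    _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN);
    fn();
    _mm_setcsr(saved);
}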
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpi16_ps(__m64 __a) -{ - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi16(__b, __a); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; -} - -/// Converts a 64-bit vector of 16-bit unsigned integer values into a -/// 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. -/// -/// \param __a -/// A 64-bit vector of 16-bit unsigned integer values. The elements of the -/// destination are copied from the corresponding elements in this operand. -/// \returns A 128-bit vector of [4 x float] containing the copied and converted -/// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpu16_ps(__m64 __a) -{ - __m64 __b, __c; - __m128 __r; - - __b = _mm_setzero_si64(); - __c = _mm_unpackhi_pi16(__a, __b); - __r = _mm_setzero_ps(); - __r = _mm_cvtpi32_ps(__r, __c); - __r = _mm_movelh_ps(__r, __r); - __c = _mm_unpacklo_pi16(__a, __b); - __r = _mm_cvtpi32_ps(__r, __c); - - return __r; -} - -/// Converts the lower four 8-bit values from a 64-bit vector of [8 x i8] -/// into a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. -/// -/// \param __a -/// A 64-bit vector of [8 x i8]. The elements of the destination are copied -/// from the corresponding lower 4 elements in this operand. -/// \returns A 128-bit vector of [4 x float] containing the copied and converted -/// values from the operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpi8_ps(__m64 __a) -{ - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_cmpgt_pi8(__b, __a); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); -} - -/// Converts the lower four unsigned 8-bit integer values from a 64-bit -/// vector of [8 x u8] into a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. -/// -/// \param __a -/// A 64-bit vector of unsigned 8-bit integer values. The elements of the -/// destination are copied from the corresponding lower 4 elements in this -/// operand. -/// \returns A 128-bit vector of [4 x float] containing the copied and converted -/// values from the source operand. -static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpu8_ps(__m64 __a) -{ - __m64 __b; - - __b = _mm_setzero_si64(); - __b = _mm_unpacklo_pi8(__a, __b); - - return _mm_cvtpi16_ps(__b); -} - -/// Converts the two 32-bit signed integer values from each 64-bit vector -/// operand of [2 x i32] into a 128-bit vector of [4 x float]. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPI2PS + COMPOSITE instruction. -/// -/// \param __a -/// A 64-bit vector of [2 x i32]. The lower elements of the destination are -/// copied from the elements in this operand. -/// \param __b -/// A 64-bit vector of [2 x i32]. The upper elements of the destination are -/// copied from the elements in this operand. -/// \returns A 128-bit vector of [4 x float] whose lower 64 bits contain the -/// copied and converted values from the first operand. The upper 64 bits -/// contain the copied and converted values from the second operand. 
-static __inline__ __m128 __DEFAULT_FN_ATTRS_MMX -_mm_cvtpi32x2_ps(__m64 __a, __m64 __b) -{ - __m128 __c; - - __c = _mm_setzero_ps(); - __c = _mm_cvtpi32_ps(__c, __b); - __c = _mm_movelh_ps(__c, __c); - - return _mm_cvtpi32_ps(__c, __a); -} - -/// Converts each single-precision floating-point element of a 128-bit -/// floating-point vector of [4 x float] into a 16-bit signed integer, and -/// packs the results into a 64-bit integer vector of [4 x i16]. -/// -/// If the floating-point element is NaN or infinity, or if the -/// floating-point element is greater than 0x7FFFFFFF or less than -0x8000, -/// it is converted to 0x8000. Otherwise if the floating-point element is -/// greater than 0x7FFF, it is converted to 0x7FFF. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPS2PI + COMPOSITE instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. -/// \returns A 64-bit integer vector of [4 x i16] containing the converted -/// values. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvtps_pi16(__m128 __a) -{ - __m64 __b, __c; - - __b = _mm_cvtps_pi32(__a); - __a = _mm_movehl_ps(__a, __a); - __c = _mm_cvtps_pi32(__a); - - return _mm_packs_pi32(__b, __c); -} - -/// Converts each single-precision floating-point element of a 128-bit -/// floating-point vector of [4 x float] into an 8-bit signed integer, and -/// packs the results into the lower 32 bits of a 64-bit integer vector of -/// [8 x i8]. The upper 32 bits of the vector are set to 0. -/// -/// If the floating-point element is NaN or infinity, or if the -/// floating-point element is greater than 0x7FFFFFFF or less than -0x80, it -/// is converted to 0x80. Otherwise if the floating-point element is greater -/// than 0x7F, it is converted to 0x7F. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the CVTPS2PI + COMPOSITE instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. -/// \returns A 64-bit integer vector of [8 x i8]. The lower 32 bits contain the -/// converted values and the upper 32 bits are set to zero. -static __inline__ __m64 __DEFAULT_FN_ATTRS_MMX -_mm_cvtps_pi8(__m128 __a) -{ - __m64 __b, __c; - - __b = _mm_cvtps_pi16(__a); - __c = _mm_setzero_si64(); - - return _mm_packs_pi16(__b, __c); -} - -/// Extracts the sign bits from each single-precision floating-point -/// element of a 128-bit floating-point vector of [4 x float] and returns the -/// sign bits in bits [3:0] of the result. Bits [31:4] of the result are set -/// to zero. -/// -/// \headerfile -/// -/// This intrinsic corresponds to the VMOVMSKPS / MOVMSKPS instruction. -/// -/// \param __a -/// A 128-bit floating-point vector of [4 x float]. -/// \returns A 32-bit integer value. Bits [3:0] contain the sign bits from each -/// single-precision floating-point element of the parameter. Bits [31:4] are -/// set to zero.
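A typical use of _mm_movemask_ps is branching on the outcome of a packed comparison. The following sketch is illustrative only (not part of this patch) and assumes the _mm_cmplt_ps comparison intrinsic declared earlier in the same header:

#include <xmmintrin.h>

/* Returns non-zero if any lane of a is strictly less than the matching lane
 * of b: the comparison sets a matching lane to all ones, so its sign bit
 * shows up in the 4-bit mask produced by _mm_movemask_ps. */
int any_lane_less_than(__m128 a, __m128 b)
{
    return _mm_movemask_ps(_mm_cmplt_ps(a, b)) != 0;
}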
-static __inline__ int __DEFAULT_FN_ATTRS -_mm_movemask_ps(__m128 __a) -{ - return __builtin_ia32_movmskps((__v4sf)__a); -} - - -#define _MM_ALIGN16 __attribute__((aligned(16))) - -#define _MM_SHUFFLE(z, y, x, w) (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) - -#define _MM_EXCEPT_INVALID (0x0001U) -#define _MM_EXCEPT_DENORM (0x0002U) -#define _MM_EXCEPT_DIV_ZERO (0x0004U) -#define _MM_EXCEPT_OVERFLOW (0x0008U) -#define _MM_EXCEPT_UNDERFLOW (0x0010U) -#define _MM_EXCEPT_INEXACT (0x0020U) -#define _MM_EXCEPT_MASK (0x003fU) - -#define _MM_MASK_INVALID (0x0080U) -#define _MM_MASK_DENORM (0x0100U) -#define _MM_MASK_DIV_ZERO (0x0200U) -#define _MM_MASK_OVERFLOW (0x0400U) -#define _MM_MASK_UNDERFLOW (0x0800U) -#define _MM_MASK_INEXACT (0x1000U) -#define _MM_MASK_MASK (0x1f80U) - -#define _MM_ROUND_NEAREST (0x0000U) -#define _MM_ROUND_DOWN (0x2000U) -#define _MM_ROUND_UP (0x4000U) -#define _MM_ROUND_TOWARD_ZERO (0x6000U) -#define _MM_ROUND_MASK (0x6000U) - -#define _MM_FLUSH_ZERO_MASK (0x8000U) -#define _MM_FLUSH_ZERO_ON (0x8000U) -#define _MM_FLUSH_ZERO_OFF (0x0000U) - -#define _MM_GET_EXCEPTION_MASK() (_mm_getcsr() & _MM_MASK_MASK) -#define _MM_GET_EXCEPTION_STATE() (_mm_getcsr() & _MM_EXCEPT_MASK) -#define _MM_GET_FLUSH_ZERO_MODE() (_mm_getcsr() & _MM_FLUSH_ZERO_MASK) -#define _MM_GET_ROUNDING_MODE() (_mm_getcsr() & _MM_ROUND_MASK) - -#define _MM_SET_EXCEPTION_MASK(x) (_mm_setcsr((_mm_getcsr() & ~_MM_MASK_MASK) | (x))) -#define _MM_SET_EXCEPTION_STATE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_EXCEPT_MASK) | (x))) -#define _MM_SET_FLUSH_ZERO_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_FLUSH_ZERO_MASK) | (x))) -#define _MM_SET_ROUNDING_MODE(x) (_mm_setcsr((_mm_getcsr() & ~_MM_ROUND_MASK) | (x))) - -#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ -do { \ - __m128 tmp3, tmp2, tmp1, tmp0; \ - tmp0 = _mm_unpacklo_ps((row0), (row1)); \ - tmp2 = _mm_unpacklo_ps((row2), (row3)); \ - tmp1 = _mm_unpackhi_ps((row0), (row1)); \ - tmp3 = _mm_unpackhi_ps((row2), (row3)); \ - (row0) = _mm_movelh_ps(tmp0, tmp2); \ - (row1) = _mm_movehl_ps(tmp2, tmp0); \ - (row2) = _mm_movelh_ps(tmp1, tmp3); \ - (row3) = _mm_movehl_ps(tmp3, tmp1); \ -} while (0) - -/* Aliases for compatibility. */ -#define _m_pextrw _mm_extract_pi16 -#define _m_pinsrw _mm_insert_pi16 -#define _m_pmaxsw _mm_max_pi16 -#define _m_pmaxub _mm_max_pu8 -#define _m_pminsw _mm_min_pi16 -#define _m_pminub _mm_min_pu8 -#define _m_pmovmskb _mm_movemask_pi8 -#define _m_pmulhuw _mm_mulhi_pu16 -#define _m_pshufw _mm_shuffle_pi16 -#define _m_maskmovq _mm_maskmove_si64 -#define _m_pavgb _mm_avg_pu8 -#define _m_pavgw _mm_avg_pu16 -#define _m_psadbw _mm_sad_pu8 -#define _m_ _mm_ -#define _m_ _mm_ - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS_MMX - -/* Ugly hack for backwards-compatibility (compatible with gcc) */ -#if defined(__SSE2__) && !__building_module(_Builtin_intrinsics) -#include -#endif - -#endif /* __XMMINTRIN_H */ diff --git a/include/xopintrin.h b/include/xopintrin.h deleted file mode 100644 index 976cdf4..0000000 --- a/include/xopintrin.h +++ /dev/null @@ -1,770 +0,0 @@ -/*===---- xopintrin.h - XOP intrinsics -------------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __X86INTRIN_H -#error "Never use directly; include instead." 
-#endif - -#ifndef __XOPINTRIN_H -#define __XOPINTRIN_H - -#include - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(128))) -#define __DEFAULT_FN_ATTRS256 __attribute__((__always_inline__, __nodebug__, __target__("xop"), __min_vector_width__(256))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccs_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacssww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_macc_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacsww((__v8hi)__A, (__v8hi)__B, (__v8hi)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccsd_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccd_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccs_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacssdd((__v4si)__A, (__v4si)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_macc_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacsdd((__v4si)__A, (__v4si)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccslo_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacssdql((__v4si)__A, (__v4si)__B, (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_macclo_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacsdql((__v4si)__A, (__v4si)__B, (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maccshi_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacssdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_macchi_epi32(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmacsdqh((__v4si)__A, (__v4si)__B, (__v2di)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maddsd_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmadcsswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_maddd_epi16(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpmadcswd((__v8hi)__A, (__v8hi)__B, (__v4si)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddw_epi8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddbw((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddd_epi8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddbd((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epi8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddbq((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddd_epi16(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddwd((__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epi16(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddwq((__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epi32(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphadddq((__v4si)__A); -} - 
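The XOP _mm_cmov_si128 defined above is a plain bitwise select, (A & C) | (B & ~C). A small usage sketch (illustrative only, not part of this patch; it requires an XOP-capable AMD CPU and compiling with -mxop) could look like this:

#include <x86intrin.h>

/* Branchless blend: bits where the mask is 1 come from on_true, the
 * remaining bits come from on_false. */
__m128i blend_by_mask(__m128i on_true, __m128i on_false, __m128i mask)
{
    return _mm_cmov_si128(on_true, on_false, mask);
}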
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddw_epu8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddubw((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddd_epu8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddubd((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epu8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddubq((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddd_epu16(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphadduwd((__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epu16(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphadduwq((__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_haddq_epu32(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphaddudq((__v4si)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubw_epi8(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphsubbw((__v16qi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubd_epi16(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphsubwd((__v8hi)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_hsubq_epi32(__m128i __A) -{ - return (__m128i)__builtin_ia32_vphsubdq((__v4si)__A); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_cmov_si128(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)(((__v2du)__A & (__v2du)__C) | ((__v2du)__B & ~(__v2du)__C)); -} - -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_cmov_si256(__m256i __A, __m256i __B, __m256i __C) -{ - return (__m256i)(((__v4du)__A & (__v4du)__C) | ((__v4du)__B & ~(__v4du)__C)); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_perm_epi8(__m128i __A, __m128i __B, __m128i __C) -{ - return (__m128i)__builtin_ia32_vpperm((__v16qi)__A, (__v16qi)__B, (__v16qi)__C); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_rot_epi8(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vprotb((__v16qi)__A, (__v16qi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_rot_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vprotw((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_rot_epi32(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vprotd((__v4si)__A, (__v4si)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_rot_epi64(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vprotq((__v2di)__A, (__v2di)__B); -} - -#define _mm_roti_epi8(A, N) \ - ((__m128i)__builtin_ia32_vprotbi((__v16qi)(__m128i)(A), (N))) - -#define _mm_roti_epi16(A, N) \ - ((__m128i)__builtin_ia32_vprotwi((__v8hi)(__m128i)(A), (N))) - -#define _mm_roti_epi32(A, N) \ - ((__m128i)__builtin_ia32_vprotdi((__v4si)(__m128i)(A), (N))) - -#define _mm_roti_epi64(A, N) \ - ((__m128i)__builtin_ia32_vprotqi((__v2di)(__m128i)(A), (N))) - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shl_epi8(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshlb((__v16qi)__A, (__v16qi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shl_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshlw((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shl_epi32(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshld((__v4si)__A, (__v4si)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_shl_epi64(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshlq((__v2di)__A, (__v2di)__B); -} - -static __inline__ __m128i 
__DEFAULT_FN_ATTRS -_mm_sha_epi8(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshab((__v16qi)__A, (__v16qi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha_epi16(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshaw((__v8hi)__A, (__v8hi)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha_epi32(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshad((__v4si)__A, (__v4si)__B); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sha_epi64(__m128i __A, __m128i __B) -{ - return (__m128i)__builtin_ia32_vpshaq((__v2di)__A, (__v2di)__B); -} - -#define _mm_com_epu8(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomub((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N))) - -#define _mm_com_epu16(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomuw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N))) - -#define _mm_com_epu32(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomud((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N))) - -#define _mm_com_epu64(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomuq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N))) - -#define _mm_com_epi8(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomb((__v16qi)(__m128i)(A), \ - (__v16qi)(__m128i)(B), (N))) - -#define _mm_com_epi16(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomw((__v8hi)(__m128i)(A), \ - (__v8hi)(__m128i)(B), (N))) - -#define _mm_com_epi32(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomd((__v4si)(__m128i)(A), \ - (__v4si)(__m128i)(B), (N))) - -#define _mm_com_epi64(A, B, N) \ - ((__m128i)__builtin_ia32_vpcomq((__v2di)(__m128i)(A), \ - (__v2di)(__m128i)(B), (N))) - -#define _MM_PCOMCTRL_LT 0 -#define _MM_PCOMCTRL_LE 1 -#define _MM_PCOMCTRL_GT 2 -#define _MM_PCOMCTRL_GE 3 -#define _MM_PCOMCTRL_EQ 4 -#define _MM_PCOMCTRL_NEQ 5 -#define _MM_PCOMCTRL_FALSE 6 -#define _MM_PCOMCTRL_TRUE 7 - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epu8(__m128i __A, __m128i __B) -{ - return _mm_com_epu8(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epu16(__m128i __A, __m128i 
__B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epu16(__m128i __A, __m128i __B) -{ - return _mm_com_epu16(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epu32(__m128i __A, __m128i __B) -{ - return _mm_com_epu32(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epu64(__m128i __A, __m128i __B) -{ - return _mm_com_epu64(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS 
-_mm_comge_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epi8(__m128i __A, __m128i __B) -{ - return _mm_com_epi8(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epi16(__m128i __A, __m128i __B) -{ - return _mm_com_epi16(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comge_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epi32(__m128i __A, __m128i __B) -{ - return _mm_com_epi32(__A, __B, _MM_PCOMCTRL_TRUE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comlt_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LT); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comle_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_LE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comgt_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GT); -} - -static __inline__ 
__m128i __DEFAULT_FN_ATTRS -_mm_comge_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_GE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comeq_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_EQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comneq_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_NEQ); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comfalse_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_FALSE); -} - -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_comtrue_epi64(__m128i __A, __m128i __B) -{ - return _mm_com_epi64(__A, __B, _MM_PCOMCTRL_TRUE); -} - -#define _mm_permute2_pd(X, Y, C, I) \ - ((__m128d)__builtin_ia32_vpermil2pd((__v2df)(__m128d)(X), \ - (__v2df)(__m128d)(Y), \ - (__v2di)(__m128i)(C), (I))) - -#define _mm256_permute2_pd(X, Y, C, I) \ - ((__m256d)__builtin_ia32_vpermil2pd256((__v4df)(__m256d)(X), \ - (__v4df)(__m256d)(Y), \ - (__v4di)(__m256i)(C), (I))) - -#define _mm_permute2_ps(X, Y, C, I) \ - ((__m128)__builtin_ia32_vpermil2ps((__v4sf)(__m128)(X), (__v4sf)(__m128)(Y), \ - (__v4si)(__m128i)(C), (I))) - -#define _mm256_permute2_ps(X, Y, C, I) \ - ((__m256)__builtin_ia32_vpermil2ps256((__v8sf)(__m256)(X), \ - (__v8sf)(__m256)(Y), \ - (__v8si)(__m256i)(C), (I))) - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_frcz_ss(__m128 __A) -{ - return (__m128)__builtin_ia32_vfrczss((__v4sf)__A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_frcz_sd(__m128d __A) -{ - return (__m128d)__builtin_ia32_vfrczsd((__v2df)__A); -} - -static __inline__ __m128 __DEFAULT_FN_ATTRS -_mm_frcz_ps(__m128 __A) -{ - return (__m128)__builtin_ia32_vfrczps((__v4sf)__A); -} - -static __inline__ __m128d __DEFAULT_FN_ATTRS -_mm_frcz_pd(__m128d __A) -{ - return (__m128d)__builtin_ia32_vfrczpd((__v2df)__A); -} - -static __inline__ __m256 __DEFAULT_FN_ATTRS256 -_mm256_frcz_ps(__m256 __A) -{ - return (__m256)__builtin_ia32_vfrczps256((__v8sf)__A); -} - -static __inline__ __m256d __DEFAULT_FN_ATTRS256 -_mm256_frcz_pd(__m256d __A) -{ - return (__m256d)__builtin_ia32_vfrczpd256((__v4df)__A); -} - -#undef __DEFAULT_FN_ATTRS -#undef __DEFAULT_FN_ATTRS256 - -#endif /* __XOPINTRIN_H */ diff --git a/include/xsavecintrin.h b/include/xsavecintrin.h deleted file mode 100644 index 5524947..0000000 --- a/include/xsavecintrin.h +++ /dev/null @@ -1,34 +0,0 @@ -/*===---- xsavecintrin.h - XSAVEC intrinsic --------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __XSAVECINTRIN_H -#define __XSAVECINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsavec"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_xsavec(void *__p, unsigned long long __m) { - __builtin_ia32_xsavec(__p, __m); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -_xsavec64(void *__p, unsigned long long __m) { - __builtin_ia32_xsavec64(__p, __m); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/xsaveintrin.h b/include/xsaveintrin.h deleted file mode 100644 index 9429db6..0000000 --- a/include/xsaveintrin.h +++ /dev/null @@ -1,63 +0,0 @@ -/*===---- xsaveintrin.h - XSAVE intrinsic ----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __XSAVEINTRIN_H -#define __XSAVEINTRIN_H - -#ifdef _MSC_VER -#define _XCR_XFEATURE_ENABLED_MASK 0 -#endif - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsave"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_xsave(void *__p, unsigned long long __m) { - __builtin_ia32_xsave(__p, __m); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_xrstor(void *__p, unsigned long long __m) { - __builtin_ia32_xrstor(__p, __m); -} - -#ifndef _MSC_VER -#define _xgetbv(A) __builtin_ia32_xgetbv((long long)(A)) -#define _xsetbv(A, B) __builtin_ia32_xsetbv((unsigned int)(A), (unsigned long long)(B)) -#else -#ifdef __cplusplus -extern "C" { -#endif -unsigned __int64 __cdecl _xgetbv(unsigned int); -void __cdecl _xsetbv(unsigned int, unsigned __int64); -#ifdef __cplusplus -} -#endif -#endif /* _MSC_VER */ - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -_xsave64(void *__p, unsigned long long __m) { - __builtin_ia32_xsave64(__p, __m); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_xrstor64(void *__p, unsigned long long __m) { - __builtin_ia32_xrstor64(__p, __m); -} - -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/xsaveoptintrin.h b/include/xsaveoptintrin.h deleted file mode 100644 index 89a4c44..0000000 --- a/include/xsaveoptintrin.h +++ /dev/null @@ -1,34 +0,0 @@ -/*===---- xsaveoptintrin.h - XSAVEOPT intrinsic ----------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __XSAVEOPTINTRIN_H -#define __XSAVEOPTINTRIN_H - -/* Define the default attributes for the functions in this file. 
*/ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaveopt"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_xsaveopt(void *__p, unsigned long long __m) { - __builtin_ia32_xsaveopt(__p, __m); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -_xsaveopt64(void *__p, unsigned long long __m) { - __builtin_ia32_xsaveopt64(__p, __m); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/xsavesintrin.h b/include/xsavesintrin.h deleted file mode 100644 index 3f99219..0000000 --- a/include/xsavesintrin.h +++ /dev/null @@ -1,44 +0,0 @@ -/*===---- xsavesintrin.h - XSAVES intrinsic --------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __XSAVESINTRIN_H -#define __XSAVESINTRIN_H - -/* Define the default attributes for the functions in this file. */ -#define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("xsaves"))) - -static __inline__ void __DEFAULT_FN_ATTRS -_xsaves(void *__p, unsigned long long __m) { - __builtin_ia32_xsaves(__p, __m); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_xrstors(void *__p, unsigned long long __m) { - __builtin_ia32_xrstors(__p, __m); -} - -#ifdef __x86_64__ -static __inline__ void __DEFAULT_FN_ATTRS -_xrstors64(void *__p, unsigned long long __m) { - __builtin_ia32_xrstors64(__p, __m); -} - -static __inline__ void __DEFAULT_FN_ATTRS -_xsaves64(void *__p, unsigned long long __m) { - __builtin_ia32_xsaves64(__p, __m); -} -#endif - -#undef __DEFAULT_FN_ATTRS - -#endif diff --git a/include/xtestintrin.h b/include/xtestintrin.h deleted file mode 100644 index 7d19e37..0000000 --- a/include/xtestintrin.h +++ /dev/null @@ -1,27 +0,0 @@ -/*===---- xtestintrin.h - XTEST intrinsic ----------------------------------=== - * - * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. - * See https://llvm.org/LICENSE.txt for license information. - * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - * - *===-----------------------------------------------------------------------=== - */ - -#ifndef __IMMINTRIN_H -#error "Never use directly; include instead." -#endif - -#ifndef __XTESTINTRIN_H -#define __XTESTINTRIN_H - -/* xtest returns non-zero if the instruction is executed within an RTM or active - * HLE region. */ -/* FIXME: This can be an either or for RTM/HLE. Deal with this when HLE is - * supported. */ -static __inline__ int - __attribute__((__always_inline__, __nodebug__, __target__("rtm"))) - _xtest(void) { - return __builtin_ia32_xtest(); -} - -#endif
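As a closing illustration of the XSAVE-family intrinsics removed above (not part of this patch; it assumes compilation with -mxsave, and the CPUID.OSXSAVE check that should precede XGETBV in production code is omitted for brevity), _xgetbv can be used to ask which register state the operating system has enabled:

#include <immintrin.h>

/* Reads XCR0 (register 0, i.e. _XCR_XFEATURE_ENABLED_MASK) and checks that
 * both SSE state (bit 1) and AVX state (bit 2) are enabled for saving. */
int os_supports_avx_state(void)
{
    unsigned long long xcr0 = _xgetbv(0);
    return (xcr0 & 0x6) == 0x6;
}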